📄 looptest.cpp
字号:
// Benchmark driver: times the update a[i] += k * b[i] written with several
// different hand-coded loop structures and prints one rate per structure.
#include <blitz/timer.h>

BZ_USING_NAMESPACE(blitz)

// Declared here, defined elsewhere. Called once per benchmarkLoops() run,
// before any timing, with the two scalars, both arrays and the length.
void initialize(double& c, double& d, double* a, double* b, int& N);

// Empty function template taking any object by reference.
// NOTE(review): not called anywhere in the visible text; presumably meant to
// keep a value "used" so the optimizer cannot discard it -- confirm.
template<class T>
void sink(T&)
{ }

void benchmarkLoops(int, long);

// Prints a banner, then runs the benchmark twice: once with short arrays and
// many repetitions (labelled "In-cache"), once with long arrays and few
// repetitions (labelled "Out of cache").
int main()
{
    cout << "This program measures the performance of DAXPY operations" << endl
         << "using various C loop structures." << endl << endl;

    cout << endl << "In-cache:" << endl;
    benchmarkLoops(400,50000);

    cout << endl << "Out of cache:" << endl;
    benchmarkLoops(1000000,50);

    return 0;
}

// Runs every timed loop variant on two arrays of N doubles, repeating each
// variant `iterations` times, and prints one result line per variant.
//
//   N          - number of elements in each array
//   iterations - how many times each variant's pair of passes is repeated
//
// Every variant performs the same two passes per repetition:
//   a[i] += c * b[i]   followed by   a[i] += d * b[i]
// Only the way the loops are written differs. The array `a` is not reset
// between variants in the visible code, so its values carry over from one
// timed section to the next.
void benchmarkLoops(int N, long iterations)
{
    // _bz_restrict is a macro from the Blitz++ headers.
    // NOTE(review): presumably expands to a no-alias qualifier -- confirm.
    // NOTE(review): no matching delete[] appears in the visible text; the end
    // of this function is not shown, so check that the arrays are released.
    double* _bz_restrict a = new double[N];
    double* _bz_restrict b = new double[N];
    double c, d;
    // NOTE(review): these two are not referenced in the visible text (the
    // t1/t2 used further down are separate loop-local variables).
    double t1, t2;

    initialize(c, d, a, b, N);

    // Operation count per variant, in units of 2^20 operations: each
    // repetition makes two passes over N elements, and each element update is
    // one multiply plus one add, giving 4 * N operations per repetition.
    // NOTE(review): the divisor is 1024*1024, not 10^6 -- confirm intended.
    double mflops = iterations * 4.0 * N / (1024.0 * 1024.0);

    Timer timer;

    cout << "Mflops/s Description" << endl;

    long iter;
    int i;

    /*********************************************************************/
    // Variant: plain forward loops using the += operator.

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        for (i=0; i < N; ++i)
            a[i] += c * b[i];
        for (i=0; i < N; ++i)
            a[i] += d * b[i];
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << " for, indirection, unit stride" << endl;

    /*********************************************************************/
    // Variant: same forward loops, with the update spelled a[i] = a[i] + ...

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        for (i=0; i < N; ++i)
            a[i] = a[i] + c * b[i];
        for (i=0; i < N; ++i)
            a[i] = a[i] + d * b[i];
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << " for, indirection, unit stride, no +=" << endl;

    /*********************************************************************/
    // Variant: loops count down from N-1 to 0.

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        for (i=N-1; i >= 0; --i)
            a[i] += c * b[i];
        for (i=N-1; i >= 0; --i)
            a[i] += d * b[i];
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << " for, indirection, unit stride, backwards loops" << endl;

    /*********************************************************************/
    // Variant: unrolled by four, with the scalar copied into a local first.
    // The first N & 3 elements (the low two bits of N) are handled one at a
    // time; i then continues from there in steps of four up to N.

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double c2 = c;
        int n1 = N & 3;
        for (i=0; i < n1; ++i)
            a[i] += c2 * b[i];
        for (; i < N; i += 4)
        {
            a[i] += c2 * b[i];
            a[i+1] += c2 * b[i+1];
            a[i+2] += c2 * b[i+2];
            a[i+3] += c2 * b[i+3];
        }

        double d2 = d;
        int n2 = N & 3;
        for (i=0; i < n2; ++i)
            a[i] += d2 * b[i];
        for (; i < N; i += 4)
        {
            a[i] += d2 * b[i];
            a[i+1] += d2 * b[i+1];
            a[i+2] += d2 * b[i+2];
            a[i+3] += d2 * b[i+3];
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << " for, unroll=4, unit stride, constants loaded into temps" << endl;

    /*********************************************************************/
    // Variant: unrolled by four; the four products are computed into locals
    // first, then the four additions into a[] are done afterwards.
    // The t1/t2 declared inside the loop bodies shadow the function-scope
    // t1/t2 declared near the top.

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double c2 = c;
        int n1 = N & 3;
        for (i=0; i < n1; ++i)
            a[i] += c2 * b[i];
        for (; i < N; i += 4)
        {
            double t1 = c2 * b[i];
            double t2 = c2 * b[i+1];
            double t3 = c2 * b[i+2];
            double t4 = c2 * b[i+3];
            a[i] += t1;
            a[i+1] += t2;
            a[i+2] += t3;
            a[i+3] += t4;
        }

        double d2 = d;
        int n2 = N & 3;
        for (i=0; i < n2; ++i)
            a[i] += d2 * b[i];
        for (; i < N; i += 4)
        {
            double t1 = d2 * b[i];
            double t2 = d2 * b[i+1];
            double t3 = d2 * b[i+2];
            double t4 = d2 * b[i+3];
            a[i] += t1;
            a[i+1] += t2;
            a[i+2] += t3;
            a[i+3] += t4;
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << " for, unroll=4, unit stride, constants loaded into temps," << endl
         << "\t\t4 read then 4 write" << endl;

    /*********************************************************************/
    // Variant: unrolled by four, with the unrolled updates spelled
    // a[k] = a[k] + ... (the remainder loops still use +=).

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double c2 = c;
        int n1 = N & 3;
        for (i=0; i < n1; ++i)
            a[i] += c2 * b[i];
        for (; i < N; i += 4)
        {
            a[i] = a[i] + c2 * b[i];
            a[i+1] = a[i+1] + c2 * b[i+1];
            a[i+2] = a[i+2] + c2 * b[i+2];
            a[i+3] = a[i+3] + c2 * b[i+3];
        }

        double d2 = d;
        int n2 = N & 3;
        for (i=0; i < n2; ++i)
            a[i] += d2 * b[i];
        for (; i < N; i += 4)
        {
            a[i] = a[i] + d2 * b[i];
            a[i+1] = a[i+1] + d2 * b[i+1];
            a[i+2] = a[i+2] + d2 * b[i+2];
            a[i+3] = a[i+3] + d2 * b[i+3];
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << " for, unroll=4, unit stride, constants loaded into temps," << endl
         << " no += " << endl;

    /*********************************************************************/
    // Variant: unrolled by four; each offset index (i+1, i+2, i+3) is
    // computed once into a local and reused for both the a[] and b[] access.
    // Each index is computed one statement ahead of where it is first used.

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double c2 = c;
        int n1 = N & 3;
        for (i=0; i < n1; ++i)
            a[i] += c2 * b[i];
        for (; i < N; i += 4)
        {
            int i1 = i + 1;
            a[i] += c2 * b[i];
            int i2 = i + 2;
            a[i1] += c2 * b[i1];
            int i3 = i + 3;
            a[i2] += c2 * b[i2];
            a[i3] += c2 * b[i3];
        }

        double d2 = d;
        int n2 = N & 3;
        for (i=0; i < n2; ++i)
            a[i] += d2 * b[i];
        for (; i < N; i += 4)
        {
            int i1 = i + 1;
            a[i] += d2 * b[i];
            int i2 = i + 2;
            a[i1] += d2 * b[i1];
            int i3 = i + 3;
            a[i2] += d2 * b[i2];
            a[i3] += d2 * b[i3];
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << " for, unroll=4, unit stride, constants loaded into temps," << endl
         << " CSE for index offsets" << endl;

    /*********************************************************************/
    // Variant: unrolled by four, walking the groups of four from the highest
    // index down to the lowest. pa/pb point just past the remainder elements,
    // and top is the pa-relative index of the last group of four.

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double c2 = c;
        int n1 = N & 3;
        for (i=0; i < n1; ++i)
            a[i] += c2 * b[i];

        double* pa = a+n1;
        double* pb = b+n1;
        int top = N - n1 - 4;
        for (i=top; i >= 0; i -= 4)
        {
            pa[i] += c2 * pb[i];
            pa[i+1] += c2 * pb[i+1];
            pa[i+2] += c2 * pb[i+2];
            pa[i+3] += c2 * pb[i+3];
        }

        double d2 = d;
        int n2 = N & 3;
        for (i=0; i < n2; ++i)
            a[i] += d2 * b[i];

        pa = a+n2;
        pb = b+n2;
        top = N - n2 - 4;
        for (i=top; i >= 0; i -= 4)
        {
            pa[i] += d2 * pb[i];
            pa[i+1] += d2 * pb[i+1];
            pa[i+2] += d2 * pb[i+2];
            pa[i+3] += d2 * pb[i+3];
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << " for, unroll=4, unit stride, constants loaded into temps,"
         << " backwards" << endl;

    /*********************************************************************/
    // Variant: unrolled by eight. The first N & 7 elements (the low three
    // bits of N) are handled one at a time, the rest in groups of eight.

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double c2 = c;
        int n1 = N & 7;
        for (i=0; i < n1; ++i)
            a[i] += c2 * b[i];
        for (; i < N; i += 8)
        {
            a[i] += c2 * b[i];
            a[i+1] += c2 * b[i+1];
            a[i+2] += c2 * b[i+2];
            a[i+3] += c2 * b[i+3];
            a[i+4] += c2 * b[i+4];
            a[i+5] += c2 * b[i+5];
            a[i+6] += c2 * b[i+6];
            a[i+7] += c2 * b[i+7];
        }

        double d2 = d;
        int n2 = N & 7;
        for (i=0; i < n2; ++i)
            a[i] += d2 * b[i];
        for (; i < N; i += 8)
        {
            a[i] += d2 * b[i];
            a[i+1] += d2 * b[i+1];
            a[i+2] += d2 * b[i+2];
            a[i+3] += d2 * b[i+3];
            a[i+4] += d2 * b[i+4];
            a[i+5] += d2 * b[i+5];
            a[i+6] += d2 * b[i+6];
            a[i+7] += d2 * b[i+7];
        }
    }
    timer.stop();
    // NOTE(review): the listing stops at this point; this variant's result
    // line and the remainder of the function are not shown.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -