📄 looptest.cpp
字号:
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " for, unroll=8, unit stride, constants loaded into temps" << endl; /*********************************************************************/ timer.start(); for (iter=0; iter < iterations; ++iter) { double c2 = c; for (i=0; i < N; ++i) a[i] += c2 * b[i]; double d2 = d; for (i=0; i < N; ++i) a[i] += d2 * b[i]; } timer.stop(); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " for, indirection, unit stride, constants into temps" << endl; /*********************************************************************/ timer.start(); for (iter=0; iter < iterations; ++iter) { int stride = 1; sink(stride); // Prevent copy propagation for (i=0; i < N; i += stride) a[i] += c * b[i]; for (i=0; i < N; i += stride) a[i] += d * b[i]; } timer.stop(); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " for, indirection, non-unit stride" << endl; /*********************************************************************/ timer.start(); for (iter=0; iter < iterations; ++iter) { int stride = 1; sink(stride); // Prevent copy propagation double c2 = c; for (i=0; i < N; i += stride) a[i] += c2 * b[i]; double d2 = d; for (i=0; i < N; i += stride) a[i] += d2 * b[i]; } timer.stop(); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " for, indirection, non-unit stride, constants " "loaded into temps" << endl; /*********************************************************************/ timer.start(); for (iter=0; iter < iterations; ++iter) { double * _bz_restrict pa1 = a, * _bz_restrict pb1 = b; double * _bz_restrict paend1 = a + N; while (pa1 != paend1) { *pa1 += c * (*pb1); ++pa1; ++pb1; } double * _bz_restrict pa2 = a, * _bz_restrict pb2 = b; double * _bz_restrict paend2 = a + N; while (pa2 != paend2) { *pa2 += d * (*pb2); ++pa2; ++pb2; } } timer.stop(); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " while, pointer increment, unit stride" 
<< endl; /*********************************************************************/ timer.start(); for (iter=0; iter < iterations; ++iter) { double * _bz_restrict pa1 = a, * _bz_restrict pb1 = b; double * _bz_restrict paend1 = a + N; double c2 = c; while (pa1 != paend1) { *pa1 += c2 * (*pb1); ++pa1; ++pb1; } double * _bz_restrict pa2 = a, * _bz_restrict pb2 = b; double * _bz_restrict paend2 = a + N; double d2 = d; while (pa2 != paend2) { *pa2 += d2 * (*pb2); ++pa2; ++pb2; } } timer.stop(); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " while, pointer increment, unit stride, " << endl << " constants loaded into temps" << endl; /*********************************************************************/ timer.start(); for (iter=0; iter < iterations; ++iter) { int stride = 1; sink(stride); double * _bz_restrict pa1 = a, * _bz_restrict pb1 = b; double * _bz_restrict paend1 = a + N * stride; while (pa1 != paend1) { *pa1 += c * (*pb1); pa1 += stride; pb1 += stride; } double * _bz_restrict pa2 = a, * _bz_restrict pb2 = b; double * _bz_restrict paend2 = a + N * stride; while (pa2 != paend2) { *pa2 += d * (*pb2); pa2 += stride; pb2 += stride; } } timer.stop(); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " while, pointer increment, non-unit stride" << endl; /*********************************************************************/ timer.start(); for (iter=0; iter < iterations; ++iter) { int stride = 1; sink(stride); double * _bz_restrict pa1 = a, * _bz_restrict pb1 = b; double * _bz_restrict paend1 = a + N * stride; double c2 = c; int n1 = N & 3; for (i=0; i < n1; ++i) { *pa1 += c2 * (*pb1); pa1 += stride; pb1 += stride; } while (pa1 != paend1) { pa1[0] += c2 * pb1[0]; pa1[1] += c2 * pb1[1]; pa1[2] += c2 * pb1[2]; pa1[3] += c2 * pb1[3]; pa1 += 4 * stride; pb1 += 4 * stride; } double * _bz_restrict pa2 = a, * _bz_restrict pb2 = b; double * _bz_restrict paend2 = a + N * stride; double d2 = d; int n2 = N & 3; for (i=0; i < n2; ++i) 
{ *pa2 += d2 * (*pb2); pa2 += stride; pb2 += stride; } while (pa2 != paend2) { pa2[0] += d2 * pb2[0]; pa2[1] += d2 * pb2[1]; pa2[2] += d2 * pb2[2]; pa2[3] += d2 * pb2[3]; pa2 += 4 * stride; pb2 += 4 * stride; } } timer.stop(); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " while, pointer increment, unroll=4, non-unit stride," << endl << " constants loaded into temps" << endl; /*********************************************************************/ timer.start(); for (iter=0; iter < iterations; ++iter) { double c2 = c; int n1 = N & 3; for (i=0; i < n1; ++i) a[i] += c2 * b[i]; for (; i < N; i += 4) { t1 = a[i+4]; a[i] += c2 * b[i]; a[i+1] += c2 * b[i+1]; t2 = b[i+4]; a[i+2] += c2 * b[i+2]; a[i+3] += c2 * b[i+3]; } double d2 = d; int n2 = N & 3; for (i=0; i < n2; ++i) a[i] += d2 * b[i]; for (; i < N; i += 4) { t1 = a[i+4]; a[i] += d2 * b[i]; a[i+1] += d2 * b[i+1]; t2 = b[i+4]; a[i+2] += d2 * b[i+2]; a[i+3] += d2 * b[i+3]; } } timer.stop(); sink(t1); sink(t2); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " for, unroll=4, unit stride, constants loaded into temps," << " prefetching" << endl; /********************************************************************/ struct vectorPair { double a; double b; }; vectorPair* v = new vectorPair[N]; int N2 = 2*N; initialize(c, d, (double*)v, (double*)v, N2); timer.start(); for (iter=0; iter < iterations; ++iter) { for (i=0; i < N; ++i) v[i].a += c * v[i].b; for (i=0; i < N; ++i) v[i].a += d * v[i].b; } timer.stop(); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " interlaced, for, indirection, unit stride" << endl; /*********************************************************************/ initialize(c, d, (double*)v, (double*)v, N2); timer.start(); for (iter=0; iter < iterations; ++iter) { double c2 = c; int n1 = N & 3; for (i=0; i < n1; ++i) v[i].a += c2 * v[i].b; for (; i < N; i += 4) { v[i].a += c2 * v[i].b; v[i+1].a += c2 * v[i+1].b; v[i+2].a += c2 * 
v[i+2].b; v[i+3].a += c2 * v[i+3].b; } double d2 = d; int n2 = N & 3; for (i=0; i < n2; ++i) v[i].a += d2 * v[i].b; for (; i < N; i += 4) { v[i].a += d2 * v[i].b; v[i+1].a += d2 * v[i+1].b; v[i+2].a += d2 * v[i+2].b; v[i+3].a += d2 * v[i+3].b; } } timer.stop(); cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds()) << " for, unroll=4, unit stride, interlaced, " << endl << "\t\tconstants loaded into temps" << endl; delete [] v; /********************************************************************/ delete [] a; delete [] b;}

/*
 * initialize
 *
 * Puts the benchmark operands into their starting state:
 *   - the first N elements of `a` are set to 1/7,
 *   - the first N elements of `b` are set to 1/3,
 *   - `c` is set to 0.398192839842 and `d` to its negation.
 *
 * The two arrays are filled in a single pass, writing a[k] before b[k] for
 * each index.  That store order is deliberate: the interlaced benchmarks in
 * this file call initialize() with the same pointer for `a` and `b`, and in
 * that case the value written through `b` is the one that remains.
 *
 * `N` is taken by non-const reference but is only read here.
 */
void initialize(double& c, double& d, double* a, double* b, int& N)
{
    const double aFill = 1/7.;
    const double bFill = 1/3.;

    int k = 0;
    while (k < N)
    {
        a[k] = aFill;
        b[k] = bFill;
        ++k;
    }

    // The scale factors are assigned after the arrays are filled; d is
    // derived from c so the two stay exact negatives of one another.
    c = 0.398192839842;
    d = -c;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -