📄 looptest.cpp
字号:
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, unroll=8, unit stride, constants loaded into temps"
<< endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double c2 = c;
for (i=0; i < N; ++i)
a[i] += c2 * b[i];
double d2 = d;
for (i=0; i < N; ++i)
a[i] += d2 * b[i];
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, indirection, unit stride, constants into temps"
<< endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
int stride = 1;
sink(stride); // Prevent copy propagation
for (i=0; i < N; i += stride)
a[i] += c * b[i];
for (i=0; i < N; i += stride)
a[i] += d * b[i];
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, indirection, non-unit stride" << endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
int stride = 1;
sink(stride); // Prevent copy propagation
double c2 = c;
for (i=0; i < N; i += stride)
a[i] += c2 * b[i];
double d2 = d;
for (i=0; i < N; i += stride)
a[i] += d2 * b[i];
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, indirection, non-unit stride, constants "
"loaded into temps" << endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double * _bz_restrict pa1 = a,
* _bz_restrict pb1 = b;
double * _bz_restrict paend1 = a + N;
while (pa1 != paend1)
{
*pa1 += c * (*pb1);
++pa1;
++pb1;
}
double * _bz_restrict pa2 = a,
* _bz_restrict pb2 = b;
double * _bz_restrict paend2 = a + N;
while (pa2 != paend2)
{
*pa2 += d * (*pb2);
++pa2;
++pb2;
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " while, pointer increment, unit stride" << endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double * _bz_restrict pa1 = a,
* _bz_restrict pb1 = b;
double * _bz_restrict paend1 = a + N;
double c2 = c;
while (pa1 != paend1)
{
*pa1 += c2 * (*pb1);
++pa1;
++pb1;
}
double * _bz_restrict pa2 = a,
* _bz_restrict pb2 = b;
double * _bz_restrict paend2 = a + N;
double d2 = d;
while (pa2 != paend2)
{
*pa2 += d2 * (*pb2);
++pa2;
++pb2;
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " while, pointer increment, unit stride, " << endl
<< " constants loaded into temps"
<< endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
int stride = 1;
sink(stride);
double * _bz_restrict pa1 = a,
* _bz_restrict pb1 = b;
double * _bz_restrict paend1 = a + N * stride;
while (pa1 != paend1)
{
*pa1 += c * (*pb1);
pa1 += stride;
pb1 += stride;
}
double * _bz_restrict pa2 = a,
* _bz_restrict pb2 = b;
double * _bz_restrict paend2 = a + N * stride;
while (pa2 != paend2)
{
*pa2 += d * (*pb2);
pa2 += stride;
pb2 += stride;
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " while, pointer increment, non-unit stride" << endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
int stride = 1;
sink(stride);
double * _bz_restrict pa1 = a,
* _bz_restrict pb1 = b;
double * _bz_restrict paend1 = a + N * stride;
double c2 = c;
int n1 = N & 3;
for (i=0; i < n1; ++i)
{
*pa1 += c2 * (*pb1);
pa1 += stride;
pb1 += stride;
}
while (pa1 != paend1)
{
pa1[0] += c2 * pb1[0];
pa1[1] += c2 * pb1[1];
pa1[2] += c2 * pb1[2];
pa1[3] += c2 * pb1[3];
pa1 += 4 * stride;
pb1 += 4 * stride;
}
double * _bz_restrict pa2 = a,
* _bz_restrict pb2 = b;
double * _bz_restrict paend2 = a + N * stride;
double d2 = d;
int n2 = N & 3;
for (i=0; i < n2; ++i)
{
*pa2 += d2 * (*pb2);
pa2 += stride;
pb2 += stride;
}
while (pa2 != paend2)
{
pa2[0] += d2 * pb2[0];
pa2[1] += d2 * pb2[1];
pa2[2] += d2 * pb2[2];
pa2[3] += d2 * pb2[3];
pa2 += 4 * stride;
pb2 += 4 * stride;
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " while, pointer increment, unroll=4, non-unit stride," << endl
<< " constants loaded into temps" << endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double c2 = c;
int n1 = N & 3;
for (i=0; i < n1; ++i)
a[i] += c2 * b[i];
for (; i < N; i += 4)
{
t1 = a[i+4];
a[i] += c2 * b[i];
a[i+1] += c2 * b[i+1];
t2 = b[i+4];
a[i+2] += c2 * b[i+2];
a[i+3] += c2 * b[i+3];
}
double d2 = d;
int n2 = N & 3;
for (i=0; i < n2; ++i)
a[i] += d2 * b[i];
for (; i < N; i += 4)
{
t1 = a[i+4];
a[i] += d2 * b[i];
a[i+1] += d2 * b[i+1];
t2 = b[i+4];
a[i+2] += d2 * b[i+2];
a[i+3] += d2 * b[i+3];
}
}
timer.stop();
sink(t1);
sink(t2);
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, unroll=4, unit stride, constants loaded into temps,"
<< " prefetching"
<< endl;
/********************************************************************/
// Two doubles declared back to back. An array of vectorPair therefore keeps
// each `a` next to its matching `b`, instead of holding the two operands in
// separate arrays as the earlier benchmarks do.
// NOTE(review): the code below casts the array to double* and treats it as
// 2*N consecutive doubles, which presumes the struct has no padding -- confirm.
struct vectorPair {
double a;   // accumulated into by the loops below: v[i].a += k * v[i].b
double b;   // operand read by those same loops
};
vectorPair* v = new vectorPair[N];
int N2 = 2*N;
initialize(c, d, (double*)v, (double*)v, N2);
timer.start();
for (iter=0; iter < iterations; ++iter)
{
for (i=0; i < N; ++i)
v[i].a += c * v[i].b;
for (i=0; i < N; ++i)
v[i].a += d * v[i].b;
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " interlaced, for, indirection, unit stride" << endl;
/*********************************************************************/
initialize(c, d, (double*)v, (double*)v, N2);
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double c2 = c;
int n1 = N & 3;
for (i=0; i < n1; ++i)
v[i].a += c2 * v[i].b;
for (; i < N; i += 4)
{
v[i].a += c2 * v[i].b;
v[i+1].a += c2 * v[i+1].b;
v[i+2].a += c2 * v[i+2].b;
v[i+3].a += c2 * v[i+3].b;
}
double d2 = d;
int n2 = N & 3;
for (i=0; i < n2; ++i)
v[i].a += d2 * v[i].b;
for (; i < N; i += 4)
{
v[i].a += d2 * v[i].b;
v[i+1].a += d2 * v[i+1].b;
v[i+2].a += d2 * v[i+2].b;
v[i+3].a += d2 * v[i+3].b;
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, unroll=4, unit stride, interlaced, " << endl
<< "\t\tconstants loaded into temps"
<< endl;
delete [] v;
/********************************************************************/
delete [] a;
delete [] b;
}
// Resets the benchmark state.
//
// Writes 1/7 into a[0..N-1] and 1/3 into b[0..N-1], then stores a fixed
// multiplier in c and its negation in d.
//
// For each index the store to `a` happens before the store to `b`; when the
// two pointers refer to overlapping storage, the later store wins.
//
// Parameters:
//   c, d - receive the multiplier and its negation.
//   a, b - arrays with at least N writable elements each.
//   N    - element count; read only, never modified here.
void initialize(double& c, double& d, double* a, double* b, int& N)
{
    const double seedA = 1/7.;
    const double seedB = 1/3.;

    int idx = 0;
    while (idx < N)
    {
        a[idx] = seedA;
        b[idx] = seedB;
        ++idx;
    }

    const double multiplier = 0.398192839842;
    c = multiplier;
    d = - c;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -