// looptest.cpp
#include <blitz/timer.h>
BZ_USING_NAMESPACE(blitz)
// Declared here; the definition is not part of this portion of the file.
// benchmarkLoops() calls it once, before any timing starts, passing the two
// scalar multipliers, the two freshly allocated arrays and the array length.
// NOTE(review): presumably it assigns c and d and fills a[] and b[] (the
// caller reads c and d afterwards without setting them itself) -- confirm
// against the definition, including whether N is modified.
void initialize(double& c, double& d, double* a, double* b, int& N);
/**
 * Accepts any lvalue by reference and does nothing with it.
 *
 * The argument is neither read nor written; the function body is empty.
 *
 * NOTE(review): presumably intended as a way to "use" a value so the
 * compiler does not discard the computation that produced it -- no call
 * is visible in this portion of the file, so confirm before relying on it.
 */
template <typename T>
void sink(T& /* ignored */)
{
}
// Forward declaration so that main() can call the benchmark before its
// definition appears. The first argument is the length of the two working
// arrays; the second is how many times each timed loop variant is repeated.
void benchmarkLoops(int, long);
/**
 * Program entry point.
 *
 * Prints a two-line banner, then runs benchmarkLoops() twice: once under
 * the heading "In-cache:" with arrays of 400 elements repeated 50000
 * times, and once under the heading "Out of cache:" with arrays of
 * 1000000 elements repeated 50 times.
 *
 * @return 0 always.
 */
int main()
{
    // Array length / repetition count for each of the two runs.
    const int  smallLength  = 400;
    const long smallRepeats = 50000;
    const int  largeLength  = 1000000;
    const long largeRepeats = 50;

    // Banner, followed by one blank line.
    cout << "This program measures the performance of DAXPY operations" << endl;
    cout << "using various C loop structures." << endl;
    cout << endl;

    // Each section is preceded by a blank line and its heading.
    cout << endl;
    cout << "In-cache:" << endl;
    benchmarkLoops(smallLength, smallRepeats);

    cout << endl;
    cout << "Out of cache:" << endl;
    benchmarkLoops(largeLength, largeRepeats);

    return 0;
}
void benchmarkLoops(int N, long iterations)
{
double* _bz_restrict a = new double[N];
double* _bz_restrict b = new double[N];
double c, d;
double t1, t2;
initialize(c, d, a, b, N);
double mflops = iterations * 4.0 * N / (1024.0 * 1024.0);
Timer timer;
cout << "Mflops/s Description" << endl;
long iter;
int i;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
for (i=0; i < N; ++i)
a[i] += c * b[i];
for (i=0; i < N; ++i)
a[i] += d * b[i];
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, indirection, unit stride" << endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
for (i=0; i < N; ++i)
a[i] = a[i] + c * b[i];
for (i=0; i < N; ++i)
a[i] = a[i] + d * b[i];
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, indirection, unit stride, no +=" << endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
for (i=N-1; i >= 0; --i)
a[i] += c * b[i];
for (i=N-1; i >= 0; --i)
a[i] += d * b[i];
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, indirection, unit stride, backwards loops" << endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double c2 = c;
int n1 = N & 3;
for (i=0; i < n1; ++i)
a[i] += c2 * b[i];
for (; i < N; i += 4)
{
a[i] += c2 * b[i];
a[i+1] += c2 * b[i+1];
a[i+2] += c2 * b[i+2];
a[i+3] += c2 * b[i+3];
}
double d2 = d;
int n2 = N & 3;
for (i=0; i < n2; ++i)
a[i] += d2 * b[i];
for (; i < N; i += 4)
{
a[i] += d2 * b[i];
a[i+1] += d2 * b[i+1];
a[i+2] += d2 * b[i+2];
a[i+3] += d2 * b[i+3];
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, unroll=4, unit stride, constants loaded into temps"
<< endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double c2 = c;
int n1 = N & 3;
for (i=0; i < n1; ++i)
a[i] += c2 * b[i];
for (; i < N; i += 4)
{
double t1 = c2 * b[i];
double t2 = c2 * b[i+1];
double t3 = c2 * b[i+2];
double t4 = c2 * b[i+3];
a[i] += t1;
a[i+1] += t2;
a[i+2] += t3;
a[i+3] += t4;
}
double d2 = d;
int n2 = N & 3;
for (i=0; i < n2; ++i)
a[i] += d2 * b[i];
for (; i < N; i += 4)
{
double t1 = d2 * b[i];
double t2 = d2 * b[i+1];
double t3 = d2 * b[i+2];
double t4 = d2 * b[i+3];
a[i] += t1;
a[i+1] += t2;
a[i+2] += t3;
a[i+3] += t4;
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, unroll=4, unit stride, constants loaded into temps,"
<< endl << "\t\t4 read then 4 write"
<< endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double c2 = c;
int n1 = N & 3;
for (i=0; i < n1; ++i)
a[i] += c2 * b[i];
for (; i < N; i += 4)
{
a[i] = a[i] + c2 * b[i];
a[i+1] = a[i+1] + c2 * b[i+1];
a[i+2] = a[i+2] + c2 * b[i+2];
a[i+3] = a[i+3] + c2 * b[i+3];
}
double d2 = d;
int n2 = N & 3;
for (i=0; i < n2; ++i)
a[i] += d2 * b[i];
for (; i < N; i += 4)
{
a[i] = a[i] + d2 * b[i];
a[i+1] = a[i+1] + d2 * b[i+1];
a[i+2] = a[i+2] + d2 * b[i+2];
a[i+3] = a[i+3] + d2 * b[i+3];
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, unroll=4, unit stride, constants loaded into temps,"
<< endl << " no += "
<< endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double c2 = c;
int n1 = N & 3;
for (i=0; i < n1; ++i)
a[i] += c2 * b[i];
for (; i < N; i += 4)
{
int i1 = i + 1;
a[i] += c2 * b[i];
int i2 = i + 2;
a[i1] += c2 * b[i1];
int i3 = i + 3;
a[i2] += c2 * b[i2];
a[i3] += c2 * b[i3];
}
double d2 = d;
int n2 = N & 3;
for (i=0; i < n2; ++i)
a[i] += d2 * b[i];
for (; i < N; i += 4)
{
int i1 = i + 1;
a[i] += d2 * b[i];
int i2 = i + 2;
a[i1] += d2 * b[i1];
int i3 = i + 3;
a[i2] += d2 * b[i2];
a[i3] += d2 * b[i3];
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, unroll=4, unit stride, constants loaded into temps,"
<< endl << " CSE for index offsets"
<< endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double c2 = c;
int n1 = N & 3;
for (i=0; i < n1; ++i)
a[i] += c2 * b[i];
double* pa = a+n1;
double* pb = b+n1;
int top = N - n1 - 4;
for (i=top; i >= 0; i -= 4)
{
pa[i] += c2 * pb[i];
pa[i+1] += c2 * pb[i+1];
pa[i+2] += c2 * pb[i+2];
pa[i+3] += c2 * pb[i+3];
}
double d2 = d;
int n2 = N & 3;
for (i=0; i < n2; ++i)
a[i] += d2 * b[i];
pa = a+n2;
pb = b+n2;
top = N - n2 - 4;
for (i=top; i >= 0; i -= 4)
{
pa[i] += d2 * pb[i];
pa[i+1] += d2 * pb[i+1];
pa[i+2] += d2 * pb[i+2];
pa[i+3] += d2 * pb[i+3];
}
}
timer.stop();
cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
<< " for, unroll=4, unit stride, constants loaded into temps,"
<< " backwards"
<< endl;
/*********************************************************************/
timer.start();
for (iter=0; iter < iterations; ++iter)
{
double c2 = c;
int n1 = N & 7;
for (i=0; i < n1; ++i)
a[i] += c2 * b[i];
for (; i < N; i += 8)
{
a[i] += c2 * b[i];
a[i+1] += c2 * b[i+1];
a[i+2] += c2 * b[i+2];
a[i+3] += c2 * b[i+3];
a[i+4] += c2 * b[i+4];
a[i+5] += c2 * b[i+5];
a[i+6] += c2 * b[i+6];
a[i+7] += c2 * b[i+7];
}
double d2 = d;
int n2 = N & 7;
for (i=0; i < n2; ++i)
a[i] += d2 * b[i];
for (; i < N; i += 8)
{
a[i] += d2 * b[i];
a[i+1] += d2 * b[i+1];
a[i+2] += d2 * b[i+2];
a[i+3] += d2 * b[i+3];
a[i+4] += d2 * b[i+4];
a[i+5] += d2 * b[i+5];
a[i+6] += d2 * b[i+6];
a[i+7] += d2 * b[i+7];
}
}
timer.stop();
// [Listing truncated here: the remainder of benchmarkLoops() is missing.
//  The code viewer's keyboard-shortcut help text that followed was removed.]