⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 looptest.cpp

📁 数值计算工具库,C语言编写的,可以直接调用.
💻 CPP
📖 第 1 页 / 共 2 页
字号:

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "    for, unroll=8, unit stride, constants loaded into temps"
         << endl;

    /*********************************************************************/

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double c2 = c;
        for (i=0; i < N; ++i)
            a[i] += c2 * b[i];

        double d2 = d;
        for (i=0; i < N; ++i)
            a[i] += d2 * b[i];
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "   for, indirection, unit stride, constants into temps"
         << endl;

    /*********************************************************************/
    
    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        int stride = 1;
        sink(stride);    // Prevent copy propagation

        for (i=0; i < N; i += stride)
            a[i] += c * b[i];

        for (i=0; i < N; i += stride)
            a[i] += d * b[i];
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "    for, indirection, non-unit stride" << endl;

    /*********************************************************************/

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        int stride = 1;
        sink(stride);    // Prevent copy propagation

        double c2 = c;
        for (i=0; i < N; i += stride)
            a[i] += c2 * b[i];

        double d2 = d;
        for (i=0; i < N; i += stride)
            a[i] += d2 * b[i];
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "    for, indirection, non-unit stride, constants "
            "loaded into temps" << endl;

    /*********************************************************************/

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double * _bz_restrict pa1 = a,
               * _bz_restrict pb1 = b;
        double * _bz_restrict paend1 = a + N;
        while (pa1 != paend1)
        {
            *pa1 += c * (*pb1);
            ++pa1;
            ++pb1;
        }

        double * _bz_restrict pa2 = a,
               * _bz_restrict pb2 = b;
        double * _bz_restrict paend2 = a + N;
        while (pa2 != paend2)
        {
            *pa2 += d * (*pb2);
            ++pa2;
            ++pb2;
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "    while, pointer increment, unit stride" << endl;

    /*********************************************************************/

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double * _bz_restrict pa1 = a,
               * _bz_restrict pb1 = b;
        double * _bz_restrict paend1 = a + N;
        double c2 = c;
        while (pa1 != paend1)
        {
            *pa1 += c2 * (*pb1);
            ++pa1;
            ++pb1;
        }

        double * _bz_restrict pa2 = a,
               * _bz_restrict pb2 = b;
        double * _bz_restrict paend2 = a + N;
        double d2 = d;
        while (pa2 != paend2)
        {
            *pa2 += d2 * (*pb2);
            ++pa2;
            ++pb2;
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "    while, pointer increment, unit stride, " << endl
         << "    constants loaded into temps" 
         << endl;

    /*********************************************************************/

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        int stride = 1;
        sink(stride);

        double * _bz_restrict pa1 = a,
               * _bz_restrict pb1 = b;
        double * _bz_restrict paend1 = a + N * stride;
        while (pa1 != paend1)
        {
            *pa1 += c * (*pb1);
            pa1 += stride;
            pb1 += stride;
        }

        double * _bz_restrict pa2 = a,
               * _bz_restrict pb2 = b;
        double * _bz_restrict paend2 = a + N * stride;
        while (pa2 != paend2)
        {
            *pa2 += d * (*pb2);
            pa2 += stride;
            pb2 += stride;
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "    while, pointer increment, non-unit stride" << endl;

    /*********************************************************************/

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        int stride = 1;
        sink(stride);

        double * _bz_restrict pa1 = a,
               * _bz_restrict pb1 = b;
        double * _bz_restrict paend1 = a + N * stride;
        double c2 = c;
        int n1 = N & 3;

        for (i=0; i < n1; ++i)
        {
            *pa1 += c2 * (*pb1);
            pa1 += stride;
            pb1 += stride;
        }

        while (pa1 != paend1)
        {
            pa1[0] += c2 * pb1[0];
            pa1[1] += c2 * pb1[1];
            pa1[2] += c2 * pb1[2];
            pa1[3] += c2 * pb1[3];
            pa1 += 4 * stride;
            pb1 += 4 * stride;
        }

        double * _bz_restrict pa2 = a,
               * _bz_restrict pb2 = b;
        double * _bz_restrict paend2 = a + N * stride;
        double d2 = d;
        int n2 = N & 3;

        for (i=0; i < n2; ++i)
        {
            *pa2 += d2 * (*pb2);
            pa2 += stride;
            pb2 += stride;
        }

        while (pa2 != paend2)
        {
            pa2[0] += d2 * pb2[0];
            pa2[1] += d2 * pb2[1];
            pa2[2] += d2 * pb2[2];
            pa2[3] += d2 * pb2[3];
            pa2 += 4 * stride;
            pb2 += 4 * stride;
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "    while, pointer increment, unroll=4, non-unit stride," << endl
         << "     constants loaded into temps" << endl;

    /*********************************************************************/

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double c2 = c;

        int n1 = N & 3;
        for (i=0; i < n1; ++i)
            a[i] += c2 * b[i];

        for (; i < N; i += 4)
        {
            t1 = a[i+4];
            a[i] += c2 * b[i];
            a[i+1] += c2 * b[i+1];
            t2 = b[i+4];
            a[i+2] += c2 * b[i+2];
            a[i+3] += c2 * b[i+3];
        }

        double d2 = d;
        int n2 = N & 3;
        for (i=0; i < n2; ++i)
            a[i] += d2 * b[i];

        for (; i < N; i += 4)
        {
            t1 = a[i+4];
            a[i] += d2 * b[i];
            a[i+1] += d2 * b[i+1];
            t2 = b[i+4];
            a[i+2] += d2 * b[i+2];
            a[i+3] += d2 * b[i+3];
        }
    }
    timer.stop();

    
    sink(t1);
    sink(t2);

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "    for, unroll=4, unit stride, constants loaded into temps,"
         << "            prefetching"
         << endl;

    /********************************************************************/

    struct vectorPair {
        double a;
        double b;
    };
    vectorPair* v = new vectorPair[N];
    int N2 = 2*N;
    initialize(c, d, (double*)v, (double*)v, N2);

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        for (i=0; i < N; ++i)
            v[i].a += c * v[i].b;

        for (i=0; i < N; ++i)
            v[i].a += d * v[i].b;
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "   interlaced, for, indirection, unit stride" << endl;

    /*********************************************************************/

    initialize(c, d, (double*)v, (double*)v, N2);

    timer.start();
    for (iter=0; iter < iterations; ++iter)
    {
        double c2 = c;

        int n1 = N & 3;
        for (i=0; i < n1; ++i)
            v[i].a += c2 * v[i].b;

        for (; i < N; i += 4)
        {
            v[i].a += c2 * v[i].b;
            v[i+1].a += c2 * v[i+1].b;
            v[i+2].a += c2 * v[i+2].b;
            v[i+3].a += c2 * v[i+3].b;
        }

        double d2 = d;
        int n2 = N & 3;
        for (i=0; i < n2; ++i)
            v[i].a += d2 * v[i].b;

        for (; i < N; i += 4)
        {
            v[i].a += d2 * v[i].b;
            v[i+1].a += d2 * v[i+1].b;
            v[i+2].a += d2 * v[i+2].b;
            v[i+3].a += d2 * v[i+3].b;
        }
    }
    timer.stop();

    cout << setw(7) << setprecision(5) << (mflops/timer.elapsedSeconds())
         << "    for, unroll=4, unit stride, interlaced, " << endl
         << "\t\tconstants loaded into temps"
         << endl;

    delete [] v;

    /********************************************************************/

    delete [] a;
    delete [] b;
}

// Resets the benchmark operands to their fixed starting values.
//
//   c, d : outputs; c receives a fixed constant and d receives its negation.
//   a, b : arrays whose first N elements are overwritten (a with 1/7,
//          b with 1/3).
//   N    : number of elements to write; only read here, never modified.
//
// The two stores are deliberately kept in one interleaved loop (a[k] first,
// then b[k]): callers in this file pass the same pointer for both arrays,
// and in that case the b-store is the one that survives.
void initialize(double& c, double& d, double* a, double* b, int& N)
{
    const double seedA = 1/7.;
    const double seedB = 1/3.;

    int k = 0;
    while (k < N)
    {
        a[k] = seedA;
        b[k] = seedB;
        ++k;
    }

    // Scalars are assigned only after the arrays have been filled.
    const double scale = 0.398192839842;
    c = scale;
    d = - c;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -