⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 atl_caxpy_x1_y1.c

📁 基于 BLAS/CLAPACK 的代码，用过的人知道是干啥的
💻 C
📖 第 1 页 / 共 2 页
字号:
            *u = ur0; ur1 += xr1 * ralpha; xi1 = X1[3]; X1 += 4;
            *v = vr0; vr1 += xr2 * ralpha; xi2 = X2[3]; X2 += 4;
            *z = zr0; zr1 += xr3 * ralpha; xi3 = X3[3]; X3 += 4;
            y[1] = yi0; yi1 += xr0 * ialpha; yr0 = y[4];
            u[1] = ui0; ui1 += xr1 * ialpha; ur0 = u[4];
            v[1] = vi0; vi1 += xr2 * ialpha; vr0 = v[4];
            z[1] = zi0; zi1 += xr3 * ialpha; zr0 = z[4];
            yr1 MEQ xi0 * ialpha; yi0 = y[5];
            ur1 MEQ xi1 * ialpha; ui0 = u[5];
            vr1 MEQ xi2 * ialpha; vi0 = v[5];
            zr1 MEQ xi3 * ialpha; zi0 = z[5];
            yi1 PEQ xi0 * ralpha; xr0 = *x;
            ui1 PEQ xi1 * ralpha; xr1 = *X1;
            vi1 PEQ xi2 * ralpha; xr2 = *X2;
            zi1 PEQ xi3 * ralpha; xr3 = *X3;
            y[2] = yr1; yr0 += xr0 * ralpha; xi0 = x[1];
            u[2] = ur1; ur0 += xr1 * ralpha; xi1 = X1[1];
            v[2] = vr1; vr0 += xr2 * ralpha; xi2 = X2[1];
            z[2] = zr1; zr0 += xr3 * ralpha; xi3 = X3[1]; X3 += 2;
            y[3] = yi1; yi0 += xr0 * ialpha;
            u[3] = ui1; ui0 += xr1 * ialpha;
            v[3] = vi1; vi0 += xr2 * ialpha;
            z[3] = zi1; zi0 += xr3 * ialpha;
            yr0 MEQ xi0 * ialpha; y += 4;
            ur0 MEQ xi1 * ialpha; u += 4;
            vr0 MEQ xi2 * ialpha; v += 4;
            zr0 MEQ xi3 * ialpha; z += 4;
            yi0 PEQ xi0 * ralpha;
            ui0 PEQ xi1 * ralpha;
            vi0 PEQ xi2 * ralpha;
            zi0 PEQ xi3 * ralpha;
            *y = yr0;
            *u = ur0;
            *v = vr0;
            *z = zr0;
            y[1] = yi0;
            u[1] = ui0;
            v[1] = vi0;
            z[1] = zi0; z += 2;
      }
      if (N-(n4<<2)) axpy_lt8(N-(n4<<2), alpha, X3, z);
   }
   else axpy_lt8(N, alpha, x, y);
}
   #undef PEQ
   #undef MEQ
#elif defined(ATL_NOMULADD) && ATL_mmnreg >= 26
static void axpy_8(const int N, const SCALAR alpha, const TYPE *x, TYPE *y)
/*
 * Computes y <- y + alpha*x (y <- y + alpha*conj(x) when Conj_ is defined)
 * for N complex elements held as interleaved (real, imag) pairs of TYPE.
 *
 * N     : element count; must be a nonzero multiple of 8 (see ATL_assert).
 * alpha : alpha[0] is the real part of the scalar, alpha[1] the imaginary.
 * x     : input vector, read only; 2*N TYPE values are read.
 * y     : vector updated in place; 2*N TYPE values are read and written.
 *
 * 8 register prefetch on X & Y, with 4 cycle multiply & 4 cycle add,
 * unrolled by 16 to ensure multiple cacheline usage for both single & double
 *
 * Each group of 8 elements is handled as two interleaved streams: the
 * x?/y? registers walk elements 0..3 of the group, while the xx?/yy?
 * registers walk elements 4..7 (8 TYPE values further on).  m0..m3 hold
 * products and a0..a3 hold sums; each statement row stores a finished
 * value, starts the next sum, starts the next product and loads operands
 * needed later, so the statement order must not be changed.
 */
{
   const register TYPE ralpha = *alpha, ialpha = alpha[1];
   register TYPE xr0, xi0, xr1, xi1, xxr0, xxi0, xxr1, xxi1;
   register TYPE yr0, yi0, yr1, yi1, yyr0, yyi0, yyr1, yyi1;
   register TYPE m0, m1, m2, m3, a0, a1, a2, a3;
   /* start of the final 8-element group; the loop below stops there */
   const TYPE *stX = x + (N<<1) - 16;

   ATL_assert( (N == (N>>3)<<3) && N );
/*
 * Fill pipe: load the first two elements of each stream and start the
 * multiply/add chains for element 0 and element 4
 */
   xr0  = *x;   xxr0 = x[8];
   xi0  = x[1]; xxi0 = x[9];
   xr1  = x[2]; xxr1 = x[10];
   xi1  = x[3]; xxi1 = x[11];
   yr0  = *y;   yyr0 = y[8];
   yi0  = y[1]; yyi0 = y[9];
   yr1  = y[2]; yyr1 = y[10];
   yi1  = y[3]; yyi1 = y[11];

   m0 = ralpha * xr0;
   m1 = ralpha * xxr0;
   m2 = ialpha * xr0; xr0  = x[4];
   m3 = ialpha *xxr0; xxr0 = x[12];
   a0 = yr0  + m0; m0 = ialpha *  xi0; yr0  = y[4];
   a1 = yyr0 + m1; m1 = ialpha * xxi0; yyr0 = y[12];
   a2 = yi0  + m2; m2 = ralpha *  xi0;  xi0  = x[5]; yi0  = y[5];
   a3 = yyi0 + m3; m3 = ralpha * xxi0; xxi0 = x[13]; yyi0 = y[13];
   /* the sign of the imag(x) contribution is the only Conj_ difference */
   #ifndef Conj_
      a0 -= m0; m0 = ralpha * xr1;
      a1 -= m1; m1 = ralpha * xxr1;
      a2 += m2; m2 = ialpha *  xr1; xr1  = x[6];
      a3 += m3; m3 = ialpha * xxr1; xxr1 = x[14];
   #else
      a0 += m0; m0 = ralpha * xr1;
      a1 += m1; m1 = ralpha * xxr1;
      a2 -= m2; m2 = ialpha *  xr1; xr1  = x[6];
      a3 -= m3; m3 = ialpha * xxr1; xxr1 = x[14];
   #endif
/*
 * Steady state: each trip stores one 8-element group of Y (16 TYPE values)
 * and preloads X and Y from the following group.  Skipped when N == 8,
 * because then the only group is finished by the drain code below.
 */
   if (N != 8)
   {
      do
      {
         *y   = a0; a0 =  yr1 + m0; m0 = ialpha *  xi1;  yr1 = y[6];
         y[8] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; yyr1 = y[14];
         y[1] = a2; a2 =  yi1 + m2; m2 = ralpha *  xi1;  xi1 = x[7];
                    yi1  = y[7];
         y[9] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; xxi1 = x[15];
                    yyi1 = y[15]; x += 16;
         /* x now points at the next group; y is advanced further down */
         #ifndef Conj_
            a0 -= m0; m0 = ralpha *  xr0;
            a1 -= m1; m1 = ralpha * xxr0;
            a2 += m2; m2 = ialpha *  xr0; xr0 = *x;
            a3 += m3; m3 = ialpha * xxr0; xxr0 = x[8];
         #else
            a0 += m0; m0 = ralpha *  xr0;
            a1 += m1; m1 = ralpha * xxr0;
            a2 -= m2; m2 = ialpha *  xr0; xr0 = *x;
            a3 -= m3; m3 = ialpha * xxr0; xxr0 = x[8];
         #endif
         /* y[16..] and y[24..] belong to the next group of Y */
         y[ 2] = a0; a0 =  yr0 + m0; m0 = ialpha *  xi0; yr0  = y[16];
         y[10] = a1; a1 = yyr0 + m1; m1 = ialpha * xxi0; yyr0 = y[24];
         y[ 3] = a2; a2 = yi0  + m2; m2 = ralpha *  xi0; xi0  = x[1];
                     yi0  = y[17];
         y[11] = a3; a3 = yyi0 + m3; m3 = ralpha * xxi0; xxi0 = x[9];
                     yyi0 = y[25];
         #ifndef Conj_
            a0 -= m0; m0 = ralpha *  xr1;
            a1 -= m1; m1 = ralpha * xxr1;
            a2 += m2; m2 = ialpha *  xr1; xr1  = x[2];
            a3 += m3; m3 = ialpha * xxr1; xxr1 = x[10];
         #else
            a0 += m0; m0 = ralpha *  xr1;
            a1 += m1; m1 = ralpha * xxr1;
            a2 -= m2; m2 = ialpha *  xr1; xr1  = x[2];
            a3 -= m3; m3 = ialpha * xxr1; xxr1 = x[10];
         #endif
         y[ 4] = a0; a0 =  yr1 + m0; m0 = ialpha *  xi1; yr1  = y[18];
         y[12] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; yyr1 = y[26];
         y[ 5] = a2; a2 = yi1  + m2; m2 = ralpha *  xi1; xi1  = x[3];
                     yi1  = y[19];
         y[13] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; xxi1 = x[11];
                     yyi1 = y[27];
         #ifndef Conj_
            a0 -= m0; m0 = ralpha *  xr0;
            a1 -= m1; m1 = ralpha * xxr0;
            a2 += m2; m2 = ialpha *  xr0; xr0 = x[4];
            a3 += m3; m3 = ialpha * xxr0; xxr0 = x[12];
         #else
            a0 += m0; m0 = ralpha *  xr0;
            a1 += m1; m1 = ralpha * xxr0;
            a2 -= m2; m2 = ialpha *  xr0; xr0 = x[4];
            a3 -= m3; m3 = ialpha * xxr0; xxr0 = x[12];
         #endif
         y[ 6] = a0; a0 =  yr0 + m0; m0 = ialpha *  xi0; yr0  = y[20];
         y[14] = a1; a1 = yyr0 + m1; m1 = ialpha * xxi0; yyr0 = y[28];
         y[ 7] = a2; a2 = yi0  + m2; m2 = ralpha *  xi0; xi0  = x[5];
                     yi0  = y[21];
         y[15] = a3; a3 = yyi0 + m3; m3 = ralpha * xxi0; xxi0 = x[13];
                     yyi0 = y[29];
         y += 16;
         #ifndef Conj_
            a0 -= m0; m0 = ralpha *  xr1;
            a1 -= m1; m1 = ralpha * xxr1;
            a2 += m2; m2 = ialpha *  xr1; xr1  = x[6];
            a3 += m3; m3 = ialpha * xxr1; xxr1 = x[14];
         #else
            a0 += m0; m0 = ralpha *  xr1;
            a1 += m1; m1 = ralpha * xxr1;
            a2 -= m2; m2 = ialpha *  xr1; xr1  = x[6];
            a3 -= m3; m3 = ialpha * xxr1; xxr1 = x[14];
         #endif
      }
      while (x != stX);
   }
/*
 * Drain pipe, store last 8 elts of Y.  Same sequence as one loop trip, but
 * without pointer updates or loads from a following group.
 */
   *y   = a0; a0 =  yr1 + m0; m0 = ialpha *  xi1;  yr1 = y[6];
   y[8] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; yyr1 = y[14];
   y[1] = a2; a2 =  yi1 + m2; m2 = ralpha *  xi1;  xi1 = x[7]; yi1  = y[7];
   y[9] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; xxi1 = x[15]; yyi1 = y[15];
   #ifndef Conj_
      a0 -= m0; m0 = ralpha *  xr0;
      a1 -= m1; m1 = ralpha * xxr0;
      a2 += m2; m2 = ialpha *  xr0;
      a3 += m3; m3 = ialpha * xxr0;
   #else
      a0 += m0; m0 = ralpha *  xr0;
      a1 += m1; m1 = ralpha * xxr0;
      a2 -= m2; m2 = ialpha *  xr0;
      a3 -= m3; m3 = ialpha * xxr0;
   #endif
   y[ 2] = a0; a0 =  yr0 + m0; m0 = ialpha *  xi0;
   y[10] = a1; a1 = yyr0 + m1; m1 = ialpha * xxi0;
   y[ 3] = a2; a2 = yi0  + m2; m2 = ralpha *  xi0;
   y[11] = a3; a3 = yyi0 + m3; m3 = ralpha * xxi0;
   #ifndef Conj_
      a0 -= m0; m0 = ralpha *  xr1;
      a1 -= m1; m1 = ralpha * xxr1;
      a2 += m2; m2 = ialpha *  xr1;
      a3 += m3; m3 = ialpha * xxr1;
   #else
      a0 += m0; m0 = ralpha *  xr1;
      a1 += m1; m1 = ralpha * xxr1;
      a2 -= m2; m2 = ialpha *  xr1;
      a3 -= m3; m3 = ialpha * xxr1;
   #endif
   y[ 4] = a0; a0 =  yr1 + m0; m0 = ialpha *  xi1;
   y[12] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1;
   y[ 5] = a2; a2 = yi1  + m2; m2 = ralpha *  xi1;
   y[13] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1;
   #ifndef Conj_
      a0 -= m0;
      a1 -= m1;
      a2 += m2;
      a3 += m3;
   #else
      a0 += m0;
      a1 += m1;
      a2 -= m2;
      a3 -= m3;
   #endif
   y[ 6] = a0;
   y[14] = a1;
   y[ 7] = a2;
   y[15] = a3;
}
#endif
#ifdef Conj_
void Mjoin(PATL,axpyConj_x1_y1)
/*
 * y <- alpha * Conj(x) + y
 */
#else
void Mjoin(PATL,axpy_x1_y1)
/*
 * y <- alpha * x + y
 */
#endif
   (const int N, const SCALAR alpha, const TYPE *X, const int incX,
    TYPE *Y, const int incY)
/*
 * Entry point; dispatches to the axpy_8 / axpy_lt8 kernels.
 *
 * N          : number of complex elements (pointers advance 2 TYPE values
 *              per element below).
 * alpha      : complex scalar, forwarded unchanged to the kernels.
 * X, Y       : input vector and in-place updated vector.
 * incX, incY : not referenced in this body; presumably callers guarantee
 *              unit stride, as the _x1_y1 suffix suggests -- TODO confirm.
 *
 * NOTE(review): if neither preprocessor condition below holds, the body is
 * empty and Y is left unmodified.
 */
{
#if defined(ATL_NOMULADD) && ATL_mmnreg >= 26
   /* n8 = largest multiple of 8 not exceeding N; nr = leftover elements */
   const int n8 = (N>>3)<<3, nr = N - n8;
   if (n8)
   {
      axpy_8(n8, alpha, X, Y);
      /* step past the n8 elements just processed */
      X += n8<<1;
      Y += n8<<1;
   }
   if (nr) axpy_lt8(nr, alpha, X, Y);
#elif defined (ATL_MULADD) && ATL_mmnreg >= 26
   /*
    * This axpy_8 variant is only partly visible here; its visible tail
    * falls back to axpy_lt8, so it appears to accept any N -- confirm.
    */
   axpy_8(N, alpha, X, Y);
#endif
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -