📄 atl_caxpy_x1_y1.c
字号:
/*
 * NOTE(review): this chunk opens in the middle of a function body.  The
 * function header, its local declarations and the definitions of the
 * PEQ / MEQ macros are outside the visible source, so the statements
 * below are reproduced unchanged and are not described further.
 */
         *u = ur0; ur1 += xr1 * ralpha; xi1 = X1[3]; X1 += 4;
         *v = vr0; vr1 += xr2 * ralpha; xi2 = X2[3]; X2 += 4;
         *z = zr0; zr1 += xr3 * ralpha; xi3 = X3[3]; X3 += 4;
         y[1] = yi0; yi1 += xr0 * ialpha; yr0 = y[4];
         u[1] = ui0; ui1 += xr1 * ialpha; ur0 = u[4];
         v[1] = vi0; vi1 += xr2 * ialpha; vr0 = v[4];
         z[1] = zi0; zi1 += xr3 * ialpha; zr0 = z[4];
         yr1 MEQ xi0 * ialpha; yi0 = y[5];
         ur1 MEQ xi1 * ialpha; ui0 = u[5];
         vr1 MEQ xi2 * ialpha; vi0 = v[5];
         zr1 MEQ xi3 * ialpha; zi0 = z[5];
         yi1 PEQ xi0 * ralpha; xr0 = *x;
         ui1 PEQ xi1 * ralpha; xr1 = *X1;
         vi1 PEQ xi2 * ralpha; xr2 = *X2;
         zi1 PEQ xi3 * ralpha; xr3 = *X3;
         y[2] = yr1; yr0 += xr0 * ralpha; xi0 = x[1];
         u[2] = ur1; ur0 += xr1 * ralpha; xi1 = X1[1];
         v[2] = vr1; vr0 += xr2 * ralpha; xi2 = X2[1];
         z[2] = zr1; zr0 += xr3 * ralpha; xi3 = X3[1]; X3 += 2;
         y[3] = yi1; yi0 += xr0 * ialpha;
         u[3] = ui1; ui0 += xr1 * ialpha;
         v[3] = vi1; vi0 += xr2 * ialpha;
         z[3] = zi1; zi0 += xr3 * ialpha;
         yr0 MEQ xi0 * ialpha; y += 4;
         ur0 MEQ xi1 * ialpha; u += 4;
         vr0 MEQ xi2 * ialpha; v += 4;
         zr0 MEQ xi3 * ialpha; z += 4;
         yi0 PEQ xi0 * ralpha;
         ui0 PEQ xi1 * ralpha;
         vi0 PEQ xi2 * ralpha;
         zi0 PEQ xi3 * ralpha;
         *y = yr0;
         *u = ur0;
         *v = vr0;
         *z = zr0;
         y[1] = yi0;
         u[1] = ui0;
         v[1] = vi0;
         z[1] = zi0;
         z += 2;
      }
/*
 *    Whatever is left after the n4 groups of 4 is handed to axpy_lt8
 */
      if (N-(n4<<2)) axpy_lt8(N-(n4<<2), alpha, X3, z);
   }
   else axpy_lt8(N, alpha, x, y);
}
   #undef PEQ
   #undef MEQ
#elif defined(ATL_NOMULADD) && ATL_mmnreg >= 26
static void axpy_8(const int N, const SCALAR alpha, const TYPE *x, TYPE *y)
/*
 * 8 register prefetch on X & Y, with 4 cycle multiply & 4 cycle add,
 * unrolled by 16 to ensure multiple cacheline usage for both single & double
 *
 * Updates y in place.  x and y are read as interleaved (real, imaginary)
 * pairs and alpha as one such pair (ralpha = alpha[0], ialpha = alpha[1]).
 * For every pair the code computes
 *    without Conj_:  yr += ralpha*xr - ialpha*xi;  yi += ialpha*xr + ralpha*xi
 *    with    Conj_:  yr += ralpha*xr + ialpha*xi;  yi += ialpha*xr - ralpha*xi
 *
 * N must be a nonzero multiple of 8 (checked by ATL_assert); each pass
 * handles 8 pairs, i.e. 16 TYPE values of x and of y.
 *
 * Every product is first placed in an m* temporary and added in a later
 * statement through an a* temporary; loads for upcoming values are issued
 * between them.  The statement order therefore matters: each a*, m*, x*
 * and y* variable is reused as soon as its previous value has been consumed.
 *
 * Two independent streams are interleaved: the plain names (xr0, yr0, a0,
 * a2, m0, m2 ...) cover offsets 0..7 of the current 16-value group, the
 * doubled names (xxr0, yyr0, a1, a3, m1, m3 ...) cover offsets 8..15.
 */
{
   const register TYPE ralpha = *alpha, ialpha = alpha[1];
   register TYPE xr0, xi0, xr1, xi1, xxr0, xxi0, xxr1, xxi1;
   register TYPE yr0, yi0, yr1, yi1, yyr0, yyi0, yyr1, yyi1;
   register TYPE m0, m1, m2, m3, a0, a1, a2, a3;
   /* start of the final 16-value group of x; the main loop stops there */
   const TYPE *stX = x + (N<<1) - 16;

   ATL_assert( (N == (N>>3)<<3) && N );
/*
 * Prime the pipe: load the first two pairs of both streams
 */
   xr0 = *x;   xxr0 = x[8];
   xi0 = x[1]; xxi0 = x[9];
   xr1 = x[2]; xxr1 = x[10];
   xi1 = x[3]; xxi1 = x[11];
   yr0 = *y;   yyr0 = y[8];
   yi0 = y[1]; yyi0 = y[9];
   yr1 = y[2]; yyr1 = y[10];
   yi1 = y[3]; yyi1 = y[11];

/*
 * Start the first products and sums, reloading each x/y variable with the
 * value two pairs further on as soon as its old value has been used
 */
   m0 = ralpha * xr0;
   m1 = ralpha * xxr0;
   m2 = ialpha * xr0; xr0 = x[4];
   m3 = ialpha *xxr0; xxr0 = x[12];
   a0 = yr0 + m0; m0 = ialpha * xi0; yr0 = y[4];
   a1 = yyr0 + m1; m1 = ialpha * xxi0; yyr0 = y[12];
   a2 = yi0 + m2; m2 = ralpha * xi0; xi0 = x[5]; yi0 = y[5];
   a3 = yyi0 + m3; m3 = ralpha * xxi0; xxi0 = x[13]; yyi0 = y[13];
   #ifndef Conj_
      a0 -= m0; m0 = ralpha * xr1;
      a1 -= m1; m1 = ralpha * xxr1;
      a2 += m2; m2 = ialpha * xr1; xr1 = x[6];
      a3 += m3; m3 = ialpha * xxr1; xxr1 = x[14];
   #else
      a0 += m0; m0 = ralpha * xr1;
      a1 += m1; m1 = ralpha * xxr1;
      a2 -= m2; m2 = ialpha * xr1; xr1 = x[6];
      a3 -= m3; m3 = ialpha * xxr1; xxr1 = x[14];
   #endif
/*
 * Main loop: skipped when N == 8, because the single group is then
 * finished entirely by the drain code below
 */
   if (N != 8)
   {
      do
      {
         *y = a0; a0 = yr1 + m0; m0 = ialpha * xi1; yr1 = y[6];
         y[8] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; yyr1 = y[14];
         y[1] = a2; a2 = yi1 + m2; m2 = ralpha * xi1; xi1 = x[7]; yi1 = y[7];
         y[9] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; xxi1 = x[15];
                    yyi1 = y[15];
         /* x now points at the next group; x[] loads below read from it */
         x += 16;
         #ifndef Conj_
            a0 -= m0; m0 = ralpha * xr0;
            a1 -= m1; m1 = ralpha * xxr0;
            a2 += m2; m2 = ialpha * xr0; xr0 = *x;
            a3 += m3; m3 = ialpha * xxr0; xxr0 = x[8];
         #else
            a0 += m0; m0 = ralpha * xr0;
            a1 += m1; m1 = ralpha * xxr0;
            a2 -= m2; m2 = ialpha * xr0; xr0 = *x;
            a3 -= m3; m3 = ialpha * xxr0; xxr0 = x[8];
         #endif
         /* y is not advanced yet, so y[16..29] are the next group's values */
         y[ 2] = a0; a0 = yr0 + m0; m0 = ialpha * xi0; yr0 = y[16];
         y[10] = a1; a1 = yyr0 + m1; m1 = ialpha * xxi0; yyr0 = y[24];
         y[ 3] = a2; a2 = yi0 + m2; m2 = ralpha * xi0; xi0 = x[1]; yi0 = y[17];
         y[11] = a3; a3 = yyi0 + m3; m3 = ralpha * xxi0; xxi0 = x[9];
                     yyi0 = y[25];
         #ifndef Conj_
            a0 -= m0; m0 = ralpha * xr1;
            a1 -= m1; m1 = ralpha * xxr1;
            a2 += m2; m2 = ialpha * xr1; xr1 = x[2];
            a3 += m3; m3 = ialpha * xxr1; xxr1 = x[10];
         #else
            a0 += m0; m0 = ralpha * xr1;
            a1 += m1; m1 = ralpha * xxr1;
            a2 -= m2; m2 = ialpha * xr1; xr1 = x[2];
            a3 -= m3; m3 = ialpha * xxr1; xxr1 = x[10];
         #endif
         y[ 4] = a0; a0 = yr1 + m0; m0 = ialpha * xi1; yr1 = y[18];
         y[12] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; yyr1 = y[26];
         y[ 5] = a2; a2 = yi1 + m2; m2 = ralpha * xi1; xi1 = x[3]; yi1 = y[19];
         y[13] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; xxi1 = x[11];
                     yyi1 = y[27];
         #ifndef Conj_
            a0 -= m0; m0 = ralpha * xr0;
            a1 -= m1; m1 = ralpha * xxr0;
            a2 += m2; m2 = ialpha * xr0; xr0 = x[4];
            a3 += m3; m3 = ialpha * xxr0; xxr0 = x[12];
         #else
            a0 += m0; m0 = ralpha * xr0;
            a1 += m1; m1 = ralpha * xxr0;
            a2 -= m2; m2 = ialpha * xr0; xr0 = x[4];
            a3 -= m3; m3 = ialpha * xxr0; xxr0 = x[12];
         #endif
         y[ 6] = a0; a0 = yr0 + m0; m0 = ialpha * xi0; yr0 = y[20];
         y[14] = a1; a1 = yyr0 + m1; m1 = ialpha * xxi0; yyr0 = y[28];
         y[ 7] = a2; a2 = yi0 + m2; m2 = ralpha * xi0; xi0 = x[5]; yi0 = y[21];
         y[15] = a3; a3 = yyi0 + m3; m3 = ralpha * xxi0; xxi0 = x[13];
                     yyi0 = y[29];
         /* current group fully stored; move y to the group x already is on */
         y += 16;
         #ifndef Conj_
            a0 -= m0; m0 = ralpha * xr1;
            a1 -= m1; m1 = ralpha * xxr1;
            a2 += m2; m2 = ialpha * xr1; xr1 = x[6];
            a3 += m3; m3 = ialpha * xxr1; xxr1 = x[14];
         #else
            a0 += m0; m0 = ralpha * xr1;
            a1 += m1; m1 = ralpha * xxr1;
            a2 -= m2; m2 = ialpha * xr1; xr1 = x[6];
            a3 -= m3; m3 = ialpha * xxr1; xxr1 = x[14];
         #endif
      }
      while (x != stX);
   }
/*
 * Drain pipe, store last 8 elts of Y
 *
 * The first step still loads the values of the final group that have not
 * been read yet (x[7], x[15], y[6..7], y[14..15]); the remaining steps
 * issue no further loads and do not move x or y.
 */
   *y = a0; a0 = yr1 + m0; m0 = ialpha * xi1; yr1 = y[6];
   y[8] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; yyr1 = y[14];
   y[1] = a2; a2 = yi1 + m2; m2 = ralpha * xi1; xi1 = x[7]; yi1 = y[7];
   y[9] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; xxi1 = x[15]; yyi1 = y[15];
   #ifndef Conj_
      a0 -= m0; m0 = ralpha * xr0;
      a1 -= m1; m1 = ralpha * xxr0;
      a2 += m2; m2 = ialpha * xr0;
      a3 += m3; m3 = ialpha * xxr0;
   #else
      a0 += m0; m0 = ralpha * xr0;
      a1 += m1; m1 = ralpha * xxr0;
      a2 -= m2; m2 = ialpha * xr0;
      a3 -= m3; m3 = ialpha * xxr0;
   #endif
   y[ 2] = a0; a0 = yr0 + m0; m0 = ialpha * xi0;
   y[10] = a1; a1 = yyr0 + m1; m1 = ialpha * xxi0;
   y[ 3] = a2; a2 = yi0 + m2; m2 = ralpha * xi0;
   y[11] = a3; a3 = yyi0 + m3; m3 = ralpha * xxi0;
   #ifndef Conj_
      a0 -= m0; m0 = ralpha * xr1;
      a1 -= m1; m1 = ralpha * xxr1;
      a2 += m2; m2 = ialpha * xr1;
      a3 += m3; m3 = ialpha * xxr1;
   #else
      a0 += m0; m0 = ralpha * xr1;
      a1 += m1; m1 = ralpha * xxr1;
      a2 -= m2; m2 = ialpha * xr1;
      a3 -= m3; m3 = ialpha * xxr1;
   #endif
   y[ 4] = a0; a0 = yr1 + m0; m0 = ialpha * xi1;
   y[12] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1;
   y[ 5] = a2; a2 = yi1 + m2; m2 = ralpha * xi1;
   y[13] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1;
   #ifndef Conj_
      a0 -= m0;
      a1 -= m1;
      a2 += m2;
      a3 += m3;
   #else
      a0 += m0;
      a1 += m1;
      a2 -= m2;
      a3 -= m3;
   #endif
   y[ 6] = a0;
   y[14] = a1;
   y[ 7] = a2;
   y[15] = a3;
}
#endif

/*
 * Public entry point.  The routine name depends on Conj_; the body is the
 * same for both names and simply dispatches to the static kernels.
 *
 * N     : number of (real, imaginary) pairs to process
 * alpha : scalar, read by the kernels as alpha[0] (real), alpha[1] (imag)
 * X, Y  : input vector and in/out vector
 * incX, incY : not referenced in this body
 *              (presumably the _x1_y1 suffix means both strides are 1 --
 *              TODO confirm against the caller)
 *
 * NOTE(review): axpy_lt8 is defined outside the visible source; it is
 * called here only with a count below 8.
 * NOTE(review): when neither preprocessor condition below holds, the body
 * is empty and Y is left untouched -- confirm that configuration cannot
 * occur, or that another file covers it.
 */
#ifdef Conj_
void Mjoin(PATL,axpyConj_x1_y1)
/*
 * y <- alpha * Conj(x) + y
 */
#else
void Mjoin(PATL,axpy_x1_y1)
/*
 * y <- alpha * x + y
 */
#endif
   (const int N, const SCALAR alpha, const TYPE *X, const int incX,
    TYPE *Y, const int incY)
{
#if defined(ATL_NOMULADD) && ATL_mmnreg >= 26
   /* n8: largest multiple of 8 not exceeding N;  nr: leftover pairs (< 8) */
   const int n8 = (N>>3)<<3, nr = N - n8;

   if (n8)
   {
      axpy_8(n8, alpha, X, Y);
      /* two TYPE values per pair, hence the shift by one */
      X += n8<<1;
      Y += n8<<1;
   }
   if (nr) axpy_lt8(nr, alpha, X, Y);
#elif defined (ATL_MULADD) && ATL_mmnreg >= 26
   /*
    * NOTE(review): presumably this calls the axpy_8 variant whose tail
    * opens this chunk; that tail passes leftovers to axpy_lt8 itself,
    * so N is passed whole -- confirm against the full file.
    */
   axpy_8(N, alpha, X, Y);
#endif
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -