caxpy8p4m0_x1y1.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 209 行
C
209 行
#include "atlas_misc.h"static void axpyCU(const int N, const SCALAR alpha, const TYPE *x, TYPE *y)/* * For cleanup, see if we can get compiler to do the work, use constant loops */{ int i; const register TYPE ralpha = *alpha, ialpha = alpha[1]; register TYPE xr, xi; switch(N) { case 1: xr = *x; xi = x[1]; *y += ralpha * xr - ialpha * xi; y[1] += ialpha * xr + ralpha * xi; break; case 2: for (i=2; i; i--, x += 2, y += 2) { xr = *x; xi = x[1]; *y += ralpha * xr - ialpha *xi; y[1] += ialpha * xr + ralpha *xi; } break; case 3: for (i=3; i; i--, x += 2, y += 2) { xr = *x; xi = x[1]; *y += ralpha * xr - ialpha *xi; y[1] += ialpha * xr + ralpha *xi; } break; case 4: for (i=4; i; i--, x += 2, y += 2) { xr = *x; xi = x[1]; *y += ralpha * xr - ialpha *xi; y[1] += ialpha * xr + ralpha *xi; } break; case 5: for (i=5; i; i--, x += 2, y += 2) { xr = *x; xi = x[1]; *y += ralpha * xr - ialpha *xi; y[1] += ialpha * xr + ralpha *xi; } break; case 6: for (i=6; i; i--, x += 2, y += 2) { xr = *x; xi = x[1]; *y += ralpha * xr - ialpha *xi; y[1] += ialpha * xr + ralpha *xi; } break; case 7: for (i=7; i; i--, x += 2, y += 2) { xr = *x; xi = x[1]; *y += ralpha * xr - ialpha *xi; y[1] += ialpha * xr + ralpha *xi; } break; default: for (i=N; i; i--, x += 2, y += 2) { xr = *x; xi = x[1]; *y += ralpha * xr - ialpha *xi; y[1] += ialpha * xr + ralpha *xi; } }}static void axpy_8(const int N, const SCALAR alpha, const TYPE *x, TYPE *y)/* * 8 register prefetch on X & Y, with 4 cycle multiply & 4 cycle add, * unrolled by 16 to ensure multiple cacheline usage for both singe & double */{ const register TYPE ralpha = *alpha, ialpha = alpha[1]; register TYPE xr0, xi0, xr1, xi1, xxr0, xxi0, xxr1, xxi1; register TYPE yr0, yi0, yr1, yi1, yyr0, yyi0, yyr1, yyi1; register TYPE m0, m1, m2, m3, a0, a1, a2, a3; const TYPE *stX = x + (N<<1) - 16;/* ATL_assert( (N == (N>>3)<<3) && N ); */ xr0 = *x; xxr0 = x[8]; xi0 = x[1]; xxi0 = x[9]; xr1 = x[2]; xxr1 = x[10]; xi1 = x[3]; xxi1 = x[11]; yr0 = *y; yyr0 = y[8]; yi0 = y[1]; yyi0 = y[9]; yr1 = y[2]; yyr1 = y[10]; yi1 = y[3]; yyi1 = y[11]; m0 = ralpha * xr0; m1 = ralpha * xxr0; m2 = ialpha * xr0; xr0 = x[4]; m3 = ialpha *xxr0; xxr0 = x[12]; a0 = yr0 + m0; m0 = ialpha * xi0; yr0 = y[4]; a1 = yyr0 + m1; m1 = ialpha * xxi0; yyr0 = y[12]; a2 = yi0 + m2; m2 = ralpha * xi0; xi0 = x[5]; yi0 = y[5]; a3 = yyi0 + m3; m3 = ralpha * xxi0; xxi0 = x[13]; yyi0 = y[13]; a0 -= m0; m0 = ralpha * xr1; a1 -= m1; m1 = ralpha * xxr1; a2 += m2; m2 = ialpha * xr1; xr1 = x[6]; a3 += m3; m3 = ialpha * xxr1; xxr1 = x[14]; if (N != 8) { do { *y = a0; a0 = yr1 + m0; m0 = ialpha * xi1; yr1 = y[6]; y[8] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; yyr1 = y[14]; y[1] = a2; a2 = yi1 + m2; m2 = ralpha * xi1; xi1 = x[7]; yi1 = y[7]; y[9] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; xxi1 = x[15]; yyi1 = y[15]; x += 16; a0 -= m0; m0 = ralpha * xr0; a1 -= m1; m1 = ralpha * xxr0; a2 += m2; m2 = ialpha * xr0; xr0 = *x; a3 += m3; m3 = ialpha * xxr0; xxr0 = x[8]; y[ 2] = a0; a0 = yr0 + m0; m0 = ialpha * xi0; yr0 = y[16]; y[10] = a1; a1 = yyr0 + m1; m1 = ialpha * xxi0; yyr0 = y[24]; y[ 3] = a2; a2 = yi0 + m2; m2 = ralpha * xi0; xi0 = x[1]; yi0 = y[17]; y[11] = a3; a3 = yyi0 + m3; m3 = ralpha * xxi0; xxi0 = x[9]; yyi0 = y[25]; a0 -= m0; m0 = ralpha * xr1; a1 -= m1; m1 = ralpha * xxr1; a2 += m2; m2 = ialpha * xr1; xr1 = x[2]; a3 += m3; m3 = ialpha * xxr1; xxr1 = x[10]; y[ 4] = a0; a0 = yr1 + m0; m0 = ialpha * xi1; yr1 = y[18]; y[12] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; yyr1 = y[26]; y[ 5] = a2; a2 = yi1 + m2; m2 = ralpha * xi1; xi1 = x[3]; yi1 = y[19]; y[13] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; xxi1 = x[11]; yyi1 = y[27]; a0 -= m0; m0 = ralpha * xr0; a1 -= m1; m1 = ralpha * xxr0; a2 += m2; m2 = ialpha * xr0; xr0 = x[4]; a3 += m3; m3 = ialpha * xxr0; xxr0 = x[12]; y[ 6] = a0; a0 = yr0 + m0; m0 = ialpha * xi0; yr0 = y[20]; y[14] = a1; a1 = yyr0 + m1; m1 = ialpha * xxi0; yyr0 = y[28]; y[ 7] = a2; a2 = yi0 + m2; m2 = ralpha * xi0; xi0 = x[5]; yi0 = y[21]; y[15] = a3; a3 = yyi0 + m3; m3 = ralpha * xxi0; xxi0 = x[13]; yyi0 = y[29]; y += 16; a0 -= m0; m0 = ralpha * xr1; a1 -= m1; m1 = ralpha * xxr1; a2 += m2; m2 = ialpha * xr1; xr1 = x[6]; a3 += m3; m3 = ialpha * xxr1; xxr1 = x[14]; } while (x != stX); }/* * Drain pipe, store last 8 elts of Y */ *y = a0; a0 = yr1 + m0; m0 = ialpha * xi1; yr1 = y[6]; y[8] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; yyr1 = y[14]; y[1] = a2; a2 = yi1 + m2; m2 = ralpha * xi1; xi1 = x[7]; yi1 = y[7]; y[9] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; xxi1 = x[15]; yyi1 = y[15]; a0 -= m0; m0 = ralpha * xr0; a1 -= m1; m1 = ralpha * xxr0; a2 += m2; m2 = ialpha * xr0; a3 += m3; m3 = ialpha * xxr0; y[ 2] = a0; a0 = yr0 + m0; m0 = ialpha * xi0; y[10] = a1; a1 = yyr0 + m1; m1 = ialpha * xxi0; y[ 3] = a2; a2 = yi0 + m2; m2 = ralpha * xi0; y[11] = a3; a3 = yyi0 + m3; m3 = ralpha * xxi0; a0 -= m0; m0 = ralpha * xr1; a1 -= m1; m1 = ralpha * xxr1; a2 += m2; m2 = ialpha * xr1; a3 += m3; m3 = ialpha * xxr1; y[ 4] = a0; a0 = yr1 + m0; m0 = ialpha * xi1; y[12] = a1; a1 = yyr1 + m1; m1 = ialpha * xxi1; y[ 5] = a2; a2 = yi1 + m2; m2 = ralpha * xi1; y[13] = a3; a3 = yyi1 + m3; m3 = ralpha * xxi1; a0 -= m0; a1 -= m1; a2 += m2; a3 += m3; y[ 6] = a0; y[14] = a1; y[ 7] = a2; y[15] = a3;}void ATL_UAXPY(const int N, const SCALAR alpha, const TYPE *X, const int incX, TYPE *Y, const int incY){ const int n8 = (N>>3)<<3, nr = N - n8; if (n8) { axpy_8(n8, alpha, X, Y); X += n8<<1; Y += n8<<1; } if (nr) axpyCU(nr, alpha, X, Y);}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?