axpy16p4x16_x1y1.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 172 行
C
172 行
#include "atlas_misc.h"static void axpyCU(const int N, const SCALAR alpha0, const TYPE *X, TYPE *Y){ const TYPE *stX; int nr = N; register TYPE alpha=alpha0; if (nr >= 16) { *Y += alpha * *X; Y[1] += alpha * X[1]; Y[2] += alpha * X[2]; Y[3] += alpha * X[3]; Y[4] += alpha * X[4]; Y[5] += alpha * X[5]; Y[6] += alpha * X[6]; Y[7] += alpha * X[7]; Y[8] += alpha * X[8]; Y[9] += alpha * X[9]; Y[10] += alpha * X[10]; Y[11] += alpha * X[11]; Y[12] += alpha * X[12]; Y[13] += alpha * X[13]; Y[14] += alpha * X[14]; Y[15] += alpha * X[15]; X += 16; Y += 16; nr -= 16; } if (nr >= 8) { *Y += alpha * *X; Y[1] += alpha * X[1]; Y[2] += alpha * X[2]; Y[3] += alpha * X[3]; Y[4] += alpha * X[4]; Y[5] += alpha * X[5]; Y[6] += alpha * X[6]; Y[7] += alpha * X[7]; X += 8; Y += 8; nr -= 8; } if (nr >= 4) { *Y += alpha * *X; Y[1] += alpha * X[1]; Y[2] += alpha * X[2]; Y[3] += alpha * X[3]; X += 4; Y += 4; nr -= 4; } if (nr >= 2) { *Y += alpha * *X; Y[1] += alpha * X[1]; X += 2; Y += 2; nr -= 2; } if (nr >= 1) { *Y += alpha * *X; }}#include "atlas_misc.h"void ATL_UAXPY(const int N, const SCALAR alpha, const TYPE *x, const int incX, TYPE *y, const int incY)/* * 4 register prefetch on X (assumed to be in L1), 16 register prefetch on * Y (L2 or main), with 8-cycle muladd. Unrolled by 16 to ensure multiple * cacheline usage for both single and double. This kernel may blow for * axpy, but it'll rock for GER. */{ const int N16 = (N>>4)<<4; int i, j; const TYPE *stX = x + N16 - 32; const register TYPE alp = alpha; register TYPE m0, m1, m2, m3, m4, m5, m6, m7; register TYPE x0, x1, xx0, xx1; register TYPE y0, y1, y2, y3, y4, y5, y6, y7; register TYPE yy0, yy1, yy2, yy3, yy4, yy5, yy6, yy7; if (N16 > 16) { x0 = *x; xx0 = x[8]; x1 = x[1]; xx1 = x[9]; y0 = *y; yy0 = y[8]; y1 = y[1]; yy1 = y[9]; y2 = y[2]; yy2 = y[10]; y3 = y[3]; yy3 = y[11]; y4 = y[4]; yy4 = y[12]; y5 = y[5]; yy5 = y[13]; y6 = y[6]; yy6 = y[14]; y7 = y[7]; yy7 = y[15]; m0 = y0 + alp * x0; x0 = x[2]; y0 = y[16]; m1 = yy0 + alp * xx0; xx0 = x[10]; yy0 = y[24]; m2 = y1 + alp * x1; x1 = x[3]; y1 = y[17]; m3 = yy1 + alp * xx1; xx1 = x[11]; yy1 = y[25]; m4 = y2 + alp * x0; x0 = x[4]; y2 = y[18]; m5 = yy2 + alp * xx0; xx0 = x[12]; yy2 = y[26]; m6 = y3 + alp * x1; x1 = x[5]; y3 = y[19]; m7 = yy3 + alp * xx1; xx1 = x[13]; yy3 = y[27]; if (N16 != 32) { do { *y = m0; m0 = y4 + alp * x0; x0 = x[ 6]; y4 = y[20]; y[ 8] = m1; m1 = yy4 + alp * xx0; xx0 = x[14]; yy4 = y[28]; y[ 1] = m2; m2 = y5 + alp * x1; x1 = x[ 7]; y5 = y[21]; y[ 9] = m3; m3 = yy5 + alp * xx1; xx1 = x[15]; yy5 = y[29]; x += 16; y[ 2] = m4; m4 = y6 + alp * x0; x0 = *x; y6 = y[22]; y[10] = m5; m5 = yy6 + alp * xx0; xx0 = x[ 8]; yy6 = y[30]; y[ 3] = m6; m6 = y7 + alp * x1; x1 = x[ 1]; y7 = y[23]; y[11] = m7; m7 = yy7 + alp * xx1; xx1 = x[ 9]; yy7 = y[31]; y[ 4] = m0; m0 = y0 + alp * x0; x0 = x[2]; y0 = y[32]; y[12] = m1; m1 = yy0 + alp * xx0; xx0 = x[10]; yy0 = y[40]; y[ 5] = m2; m2 = y1 + alp * x1; x1 = x[3]; y1 = y[33]; y[13] = m3; m3 = yy1 + alp * xx1; xx1 = x[11]; yy1 = y[41]; y[ 6] = m4; m4 = y2 + alp * x0; x0 = x[4]; y2 = y[34]; y[14] = m5; m5 = yy2 + alp * xx0; xx0 = x[12]; yy2 = y[42]; y[ 7] = m6; m6 = y3 + alp * x1; x1 = x[5]; y3 = y[35]; y[15] = m7; m7 = yy3 + alp * xx1; xx1 = x[13]; yy3 = y[43]; y += 16; } while (x != stX); } *y = m0; m0 = y4 + alp * x0; x0 = x[ 6]; y4 = y[20]; y[ 8] = m1; m1 = yy4 + alp * xx0; xx0 = x[14]; yy4 = y[28]; y[ 1] = m2; m2 = y5 + alp * x1; x1 = x[ 7]; y5 = y[21]; y[ 9] = m3; m3 = yy5 + alp * xx1; xx1 = x[15]; yy5 = y[29]; x += 16; y[ 2] = m4; m4 = y6 + alp * x0; x0 = *x; y6 = y[22]; y[10] = m5; m5 = yy6 + alp * xx0; xx0 = x[ 8]; yy6 = y[30]; y[ 3] = m6; m6 = y7 + alp * x1; x1 = x[ 1]; y7 = y[23]; y[11] = m7; m7 = yy7 + alp * xx1; xx1 = x[ 9]; yy7 = y[31]; y[ 4] = m0; m0 = y0 + alp * x0; x0 = x[2]; y[12] = m1; m1 = yy0 + alp * xx0; xx0 = x[10]; y[ 5] = m2; m2 = y1 + alp * x1; x1 = x[3]; y[13] = m3; m3 = yy1 + alp * xx1; xx1 = x[11]; y[ 6] = m4; m4 = y2 + alp * x0; x0 = x[4]; y[14] = m5; m5 = yy2 + alp * xx0; xx0 = x[12]; y[ 7] = m6; m6 = y3 + alp * x1; x1 = x[5]; y[15] = m7; m7 = yy3 + alp * xx1; xx1 = x[13]; y += 16; *y = m0; m0 = y4 + alp * x0; x0 = x[ 6]; y[ 8] = m1; m1 = yy4 + alp * xx0; xx0 = x[14]; y[ 1] = m2; m2 = y5 + alp * x1; x1 = x[ 7]; y[ 9] = m3; m3 = yy5 + alp * xx1; xx1 = x[15]; x += 16; y[ 2] = m4; m4 = y6 + alp * x0; y[10] = m5; m5 = yy6 + alp * xx0; y[ 3] = m6; m6 = y7 + alp * x1; y[11] = m7; m7 = yy7 + alp * xx1; y[ 4] = m0; y[12] = m1; y[ 5] = m2; y[13] = m3; y[ 6] = m4; y[14] = m5; y[ 7] = m6; y[15] = m7; y += 16; axpyCU(N-N16, alpha, x, y); } else axpyCU(N, alpha, x, y);}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?