ciamax_abs2_x1.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 210 行
C
210 行
#include "atlas_misc.h"#include <math.h>#if defined(ATL_AltiVec) && defined(SCPLX)#define ATL_NoFakePF#include "atlas_prefetch.h"static int ATL_amax_av(const int N, const TYPE *X, const int incX)/* * Assuming aligned X && N multiple of 4, finds abs max*/{ const TYPE *stX = X + N+N;#ifdef ATL_AVgcc const vector unsigned char vp0 = (vector unsigned char) {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}; const vector unsigned char vp1 = (vector unsigned char) {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; vector float v0, v1, v2, v3, vmax = (vector float){0.0f, 0.0f, 0.0f, 0.0f};#else const vector unsigned char vp0 = (vector unsigned char) (0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27); const vector unsigned char vp1 = (vector unsigned char) (4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31); vector float v0, v1, v2, v3, vmax = (vector float)(0.0f, 0.0f, 0.0f, 0.0f);#endif const TYPE *xp=X, *x=X; char ch[128]; TYPE *tp; int i; register TYPE r0, r1; tp = ATL_AlignPtr((void*)ch); do { v0 = vec_ldl(0, X); v0 = vec_abs(v0); v1 = vec_ldl(0, X+4); X += 8; v1 = vec_abs(v1); v2 = vec_perm(v0, v1, vp0); v3 = vec_perm(v0, v1, vp1); v0 = vec_add(v2, v3); if (vec_all_ge(vmax, v0)) continue; vmax = vec_max(v0, vmax); v0 = vec_splat(vmax, 0); v1 = vec_splat(vmax, 1); v2 = vec_splat(vmax, 2); vmax = vec_splat(vmax, 3); v0 = vec_max(v0, v1); vmax = vec_max(v2, vmax); vmax = vec_max(v0, vmax); xp = X - 8; } while (X != stX); vec_st(vmax, 0, tp); r1 = *tp; for (i=0; i < 4; i++) { r0 = fabs(*xp) + fabs(xp[1]); if (r0 == r1) break; xp += 2; } if (i == 4) exit(-1); return(((int)(xp-x))>>1);}static int ATL_amax(const int N, const TYPE *X, const int incX){ const int incX2 = incX+incX; register TYPE xr, xi, yr, yi, xmax; int imax=0; const TYPE *stX = X + N+N, *xp=X, *x; int cwrd = ATL_MulBySize(N)>>4; switch(N) { case 1: return(0); case 2: xr = fabs(*X) + fabs(X[1]); xi = fabs(X[2]) + fabs(X[3]); if (xr >= xi) return(0); else return(1); case 3: xr = fabs(*X) + fabs(X[1]); xi = fabs(X[2]) + fabs(X[3]); yr = fabs(X[4]) + fabs(X[5]); if (xr >= xi && xr >= yr) return(0); else if (xi >= yr) return(1); else return(2); default:; } xr = *X; xi = X[1]; xmax = fabs(xr) + fabs(xi); if ((N>>1)<<1 == N) { xr = X[2]; xi = X[3]; xr = fabs(xr) + fabs(xi); if (xr > xmax) { xmax = xr; xp = X + 2; } x = X + 4; } else x = X + 2; if (N > 2) { do { ATL_pfl1R(x + 32); xr = *x; xi = x[1]; yr = x[2]; yi = x[3]; x += 4; xr = fabs(xr) + fabs(xi); yr = fabs(yr) + fabs(yi); if (xmax >= xr && xmax >= yr) continue; else if (xr >= yr) { xmax = xr; xp = x - 4; } else { xmax = yr; xp = x - 2; } } while(x != stX); } imax = (int) (xp - X); return(imax>>1);}static int GetMax(int I0, int I1, const TYPE *X)/* * Returns max of two maxes, assuming i0 is smaller index */{ register TYPE r0, r1; const int i0 = I0<<1, i1 = I1<<1; r0 = fabs(X[i0]) + fabs(X[i0+1]); r1 = fabs(X[i1]) + fabs(X[i1+1]); if (r0 >= r1) return(I0); else return(I1);}int ATL_UIAMAX(const int N, const TYPE *X, const int incX){ int cwrd = ATL_MulBySize(N)>>4; int im=0; int n0, n1, n2; if (N > 64) { if (cwrd >= 64) { cwrd = (cwrd+31)>>5; if (cwrd <= 256) cwrd = ATL_GetCtrl(512, cwrd <= 255 ? cwrd : 0, 0); else /* use all pipes */ { cwrd >>= 2; cwrd = ATL_GetCtrl(2048, cwrd <= 255 ? cwrd : 0, 0); ATL_pfavR(X+128, cwrd, 1); ATL_pfavR(X+256, cwrd, 2); ATL_pfavR(X+384, cwrd, 3); } } else cwrd = ATL_GetCtrl(64, (cwrd+3)>>2, 4); ATL_pfavR(X, cwrd, 0); n0 = ATL_AlignOffset(N, X, ATL_sizeof, ATL_MulBySize(4)); if (n0) im = ATL_amax(n0, X, incX); n1 = ((N - n0)>>2)<<2; n2 = N - n0 - n1; if (n1) im = GetMax(im, n0+ATL_amax_av(n1, X+n0+n0, incX), X); if (n2) im = GetMax(im, n0+n1+ATL_amax(n2, X+n0+n0+n1+n1, incX), X); } else im = ATL_amax(N, X, incX); return(im);}#elseint ATL_UIAMAX(const int N, const TYPE *X, const int incX){ const int incX2 = incX+incX; register TYPE xr, xi, yr, yi, xmax; int imax=0; const TYPE *stX = X + N+N, *xp=X, *x; if (N > 0) { xr = *X; xi = X[1]; xmax = fabs(xr) + fabs(xi); if ((N>>1)<<1 == N) { xr = X[2]; xi = X[3]; xr = fabs(xr) + fabs(xi); if (xr > xmax) { xmax = xr; xp = X + 2; } x = X + 4; } else x = X + 2; } if (N > 2) { do { xr = *x; xi = x[1]; yr = x[2]; yi = x[3]; x += 4; xr = fabs(xr) + fabs(xi); yr = fabs(yr) + fabs(yi); if (xmax >= xr && xmax >= yr) continue; else if (xr >= yr) { xmax = xr; xp = x - 4; } else { xmax = yr; xp = x - 2; } } while(x != stX); } imax = (int) (xp - X); return(imax>>1);}#endif
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?