📄 atl_gemv_sse.c
字号:
pm(3,2) \ pa(2,6) \ plb(SS(a_,MM(1,RS4)),bx,3) \ plq(SS(a_,MM(1,RS4)),ax,0) \ plqx(SS(a_,MM(1,RS4)),ax,bp,1,1) \ plqx(SS(a_,MM(1,RS4)),ax,bp,2,2) \ pm(3,0) \ pa(0,4) \ pm(3,1) \ pa(1,5) \ pm(3,2) \ pa(2,6)#undef p4_gemvT_3_1#define p4_gemvT_3_1(a_) \ plb(SS(a_,MM(0,RS4)),bx,3) \ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plqx(SS(a_,MM(0,RS4)),ax,bp,1,1) \ plqx(SS(a_,MM(0,RS4)),ax,bp,2,2) \ pm(3,0) \ pa(0,4) \ pm(3,1) \ pa(1,5) \ pm(3,2) \ pa(2,6) \ plb(SS(a_,MM(1,RS4)),bx,3) \ plq(SS(a_,MM(1,RS4)),ax,0) \ pfx(nta,SS(a_,MM((SS(0,CL)),RS4)),ax,bp,1) \ plqx(SS(a_,MM(1,RS4)),ax,bp,1,1) \ plqx(SS(a_,MM(1,RS4)),ax,bp,2,2) \ pm(3,0) \ pa(0,4) \ pm(3,1) \ pa(1,5) \ pm(3,2) \ pa(2,6) \ plb(SS(a_,MM(2,RS4)),bx,3) \ plq(SS(a_,MM(2,RS4)),ax,0) \ plqx(SS(a_,MM(2,RS4)),ax,bp,1,1) \ pfx(nta,SS(a_,MM((SS(0,CL)),RS4)),ax,bp,2) \ plqx(SS(a_,MM(2,RS4)),ax,bp,2,2) \ pm(3,0) \ pa(0,4) \ pm(3,1) \ pa(1,5) \ pm(3,2) \ pa(2,6) \ plb(SS(a_,MM(3,RS4)),bx,3) \ plq(SS(a_,MM(3,RS4)),ax,0) \ plqx(SS(a_,MM(3,RS4)),ax,bp,1,1) \ plqx(SS(a_,MM(3,RS4)),ax,bp,2,2) \ pm(3,0) \ pa(0,4) \ pm(3,1) \ pa(1,5) \ pm(3,2) \ pa(2,6)#undef lpgemvT_3_1#define lpgemvT_3_1(a_)#undef dpgemvT_3_1#define dpgemvT_3_1(a_) p4_gemvT_3_1(a_)#undef plgemvT_3_1#define plgemvT_3_1 16#undef p1_4_gemvT_1_2#define p1_4_gemvT_1_2(a_) \ pls(SS(a_,MM(0,RS4)),bx,4) \ pls(SS(a_,MM(0,RS4)),ax,0) \ pmsr(4,0) \ pasr(0,6)#undef p1_2_gemvT_1_2#define p1_2_gemvT_1_2(a_) \ px(4) \ pld(SS(a_,MM(0,RS4)),bx,4) \ px(0) \ pld(SS(a_,MM(0,RS4)),ax,0) \ pm(4,0) \ pa(0,6)#undef p1_gemvT_1_2#define p1_gemvT_1_2(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plq(SS(a_,MM(0,RS4)),ax,0) \ pm(4,0) \ pa(0,6)#undef p2_gemvT_1_2#define p2_gemvT_1_2(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(1,RS4)),ax,1) \ pm(4,0) \ pa(0,6) \ pm(5,1) \ pa(1,6)#undef p4_gemvT_1_2#define p4_gemvT_1_2(a_) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plq(SS(a_,MM(1,RS4)),ax,1) \ pm(0,4) \ pa(4,6) \ plb(SS(a_,MM(2,RS4)),bx,4) \ plq(SS(a_,MM(2,RS4)),ax,0) \ pm(1,5) \ pa(5,6) \ plb(SS(a_,MM(3,RS4)),bx,5) \ plq(SS(a_,MM(3,RS4)),ax,1) \ pm(0,4) \ pa(4,6) \ plb(SS(a_,MM(4,RS4)),bx,4) \ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ plq(SS(a_,MM(4,RS4)),ax,0) \ pm(1,5) \ pa(5,6)#undef lpgemvT_1_2#define lpgemvT_1_2(a_) \ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plq(SS(a_,MM(0,RS4)),ax,0)#undef dpgemvT_1_2#define dpgemvT_1_2(a_) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plq(SS(a_,MM(1,RS4)),ax,1) \ pm(0,4) \ pa(4,6) \ plb(SS(a_,MM(2,RS4)),bx,4) \ plq(SS(a_,MM(2,RS4)),ax,0) \ pm(1,5) \ pa(5,6) \ plb(SS(a_,MM(3,RS4)),bx,5) \ plq(SS(a_,MM(3,RS4)),ax,1) \ pm(0,4) \ pa(4,6) \ pm(1,5) \ pa(5,6)#undef plgemvT_1_2#define plgemvT_1_2 16/* #define BITS 8 *//* #define CL 56 */#define NDPM 1#define BITS 8#if defined(SREAL) || defined(DREAL)#define CL 56#else#define CL 32#endif/* #include "out.h" *//* #include "foo.h" */#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),gemv),Mjoin(FEXT,Mjoin(_a1_x1_,Mjoin(BL,_y1))))#undef MY_FUNCTION#define MY_FUNCTION FNvoidMY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *a, int lda,const TYPE *b,int binc, const SCALAR beta,TYPE *c,int cinc) { NO_INLINE int ks;#ifdef GCCWIN void *freeme[4]={NULL,NULL,NULL,NULL};#endif const TYPE *bs[4]={NULL,NULL,NULL,NULL}; const TYPE *ds[5]={NULL,NULL,NULL,NULL,NULL};#undef AL#define AL(a_) (((unsigned long)(a_))&0xf) const TYPE *at; long j; int k; at=a; for (k=0;k<4;at+=lda,k++) {#ifdef REAL int l=AL(at)/4;#else int l=0;#endif if (bs[l]) continue;#ifdef COPY_B if (l!=AL(b)/4) { #ifndef GCCWIN bs[l]=alloca(n*sizeof(*b)+15); #else freeme[l] = bs[l]=malloc(n*sizeof(*b)+15); #endif ATL_assert(bs[l]); j=4*l-AL(bs[l]); j=j<0? j+16 : j; bs[l]=(void *)bs[l]+j; Mjoin(ATL_,Mjoin(PREC,copy))(n,(void *)b,1,(void *)bs[l],1);#ifdef MDEBUG printf("Allocing: a %p b %p X0 %p %d %d %d j %d %f %f\n", at,b,bs[l],AL(at),AL(b),AL(bs[l]),j,b[0],*(TYPE *)(bs[l]));#endif } else#endif bs[l]=b; }#if !defined(REAL) if (!bs[AL(a)/4]) bs[AL(a)/4]=bs[0];#if defined(SINGLE) if (!bs[AL(a+lda)/4]) bs[AL(a+lda)/4]=bs[0];#endif#endif#undef N#define N main ds[0]=a+m*lda; ds[1]=(const TYPE *)(lda*sizeof(*a)); ds[2]=(const TYPE *)sizeof(*c);#if defined(BETAX) || defined(BETAXI0)#if defined(REAL) ds[3]=β#else ds[3]=beta;#endif#endif#if !defined(REAL) ds[4]=signd;#endif#ifdef MDEBUG printf("bs is %p %p %p %p %p\n",bs,bs[0],bs[1],bs[2],bs[3]); printf("ds is %p %p %p %p %p\n",ds,ds[0],ds[1],ds[2],ds[3],ds[4]); printf("b is %p\n",b);#endif ASM ( "pushl %%ebx\n\t" "movl %%esi,%%ebx\n\t" align lab(a_loop) icmpr(ax,di) je(a_end) push(ax) push(bx) push(dx) "movl %%eax,%%esi\n\t" "and $0xf,%%esi\n\t" "movl (%%ebx,%%esi,1),%%ebx\n\t" px(6) px(7)#undef VERS#if defined(REAL)#define VERS 3#else#define VERS 1c#endif#undef N#define N Mjoin(Mjoin(gemvT_,1_),VERS)#define ALIGN#undef INC#define INC(a_) a(a_,ax) a(a_,bx)#undef LR#define LR dx#include "camm_tpipe.h"#undef N#define N main#ifndef BETA0 f(nta,0,cx)#endif#if defined(REAL)#if VERS == 3 pa(7,6)#endif#else "movl 16(%%edi),%%esi\n\t" pl(0,si,1)#ifdef Conj_ pm(1,7)#else pm(1,6)#endif pc(6,5) pul(7,6) puh(7,5) pa(5,6)#endif#if defined(SINGLE) px(5) phl(6,5) pa(5,6)#endif#if defined(REAL) pc(6,5) ps(1,6,6) pasr(5,6)#endif#if !defined(BETA0)#if defined(REAL) pls(0,cx,0)#elif defined(SINGLE) px(0) pld(0,cx,0)#else pl(0,cx,0)#endif#if defined(BETAX) || defined(BETAXI0) "movl 12(%%edi),%%esi\n\t"#if defined(REAL) || defined(BETAXI0) pls(0,si,5)#elif defined(SINGLE) px(5) pld(0,si,5)#else pl(0,si,5)#endif#if defined(REAL) pmsr(5,0)#elif defined(BETAXI0) ps(0,5,5) pm(5,0)#else pc(0,2) ps(CSHUF,0,0) pm(5,2) pm(5,0) pm(1,2)#if defined(SINGLE) pul(0,2) phl(2,0)#else pc(0,1) pc(2,0) pul(1,2) puh(1,0)#endif#endif#endif#if defined(REAL) pasr(0,6)#else pa(0,6)#if defined(BETAX) && !defined(REAL) pa(2,6)#endif#endif#endif#if defined(REAL) pus(6,0,cx)#elif defined(SINGLE) pud(6,0,cx)#else pu(6,0,cx)#endif pop(dx) pop(bx) pop(ax) "addl 4(%%edi),%%eax\n\t" "addl 8(%%edi),%%ecx\n\t" jmp(a_loop) lab(a_end) "movl %%ebx,%%esi\n\t" "popl %%ebx\n\t" ::"a" (a),"S" (bs),"c" (c),"d" (n*DIV),"D" (ds) :"memory"); #ifdef GCCWIN for (k=0; k < 4; k++) if (freeme[k]) free(freeme[k]); #endif#ifdef MDEBUG printf("bs is %p %p %p %p %p\n",bs,bs[0],bs[1],bs[2],bs[3]); printf("ds is %p %p %p %p %p\n",ds,ds[0],ds[1],ds[2],ds[3],ds[4]); printf("b is %p\n",b);#endif}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -