📄 camm_dpa.h
字号:
#undef dp#define dp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ fm(sri(c_),0) fap(0,tri(c_))\ fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \ fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))#undef dpp#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ pf(d_,e_) fm(sri(c_),0) fap(0,tri(c_))\ fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \ fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))#endif#else#undef plaa#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax#undef wa#define wa(a_)#undef ddprr#define ddprr(a_,b_,c_) fl(a_ ## 0,b_) \ fd(tri(c_)) fm(P(sri(c_),1),0) fap(0,1) \ fd(M(trr(c_),1)) fm(srr(c_),0) fspi(0,1) \ fp(a_ ## 0,b_) #undef ddpri#define ddpri(a_,b_,c_) fl(a_ ## 8,b_) \ fd(tii(c_)) fm(P(sii(c_),1),0) fap(0,1) \ fd(M(tir(c_),1)) fm(sir(c_),0) fapi(0,1) \ fp(a_ ## 8,b_) #undef dpri#define dpri(a_,b_,c_) fl(a_ ## 8,b_) \ fx(2) fm(sir(c_),0) fap(0,2) \ fm(M(sii(c_),2),0) fapi(0,1) \ fp(a_ ## 8,b_)#undef ddpp#define ddpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) ddpri(a_,b_,c_)#undef ddp#define ddp(a_,b_,c_) ddprr(a_,b_,c_) ddpri(a_,b_,c_)#undef dpp#define dpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) dpri(a_,b_,c_)#undef dp#define dp(a_,b_,c_) ddprr(a_,b_,c_) dpri(a_,b_,c_)#endif#undef R1#define R1 4#undef R2#define R2 6#undef R3#define R3 6#undef R4#define R4 6#endif#endif/****************************************************************************** * General Macros ******************************************************************************/ #undef bla1#define bla1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,si) wa(a_) #undef blb1#define blb1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,ax) wa(a_) #undef bla2#undef bla2#define bla2(a_,b_) pf(b_,si) plaa(a_) ddp(a_,ax,R1) pf(b_,ax) dp(a_,bx,R2) wa(a_)#undef blb2#undef blb2#define blb2(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) dp(a_,bx,R2) wa(a_) #undef bla3#define bla3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddp(a_,bx,R2) \ dpp(a_,cx,R3,b_,ax) wa(a_)#undef blb3#define blb3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) ddp(a_,bx,R2) \ dpp(a_,cx,R3,b_,cx) wa(a_) #undef bla4#define bla4(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddpp(a_,bx,R2,b_,ax) \ ddp(a_,cx,R3) dpp(a_,dx,R4,b_,bx) wa(a_)#undef blb4#define blb4(a_,b_) plaa(a_) ddp(a_,ax,R1) ddpp(a_,bx,R2,b_,cx) \ ddp(a_,cx,R3) dpp(a_,dx,R4,b_,dx) wa(a_)#undef bla#define bla(a_,b_) Mjoin(bla,NDP)(a_,b_)#undef blb#define blb(a_,b_) Mjoin(blb,NDP)(a_,b_)#undef bla11_2#define bla11_2(a_) plaa1_2(a_) dp1_2(a_,ax,R1) wa1_2(a_) #undef bla21_2#define bla21_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) dp1_2(a_,bx,R2) wa1_2(a_)#undef bla31_2#define bla31_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \ dp1_2(a_,cx,R3) wa1_2(a_)#undef bla41_2#define bla41_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \ ddp1_2(a_,cx,R3) dp1_2(a_,dx,R4) wa1_2(a_)#undef bla1_2#define bla1_2(a_) Mjoin(Mjoin(bla,NDP),1_2)(a_)#undef bla11_4#define bla11_4(a_) plaa1_4(a_) dp1_4(a_,ax,R1) wa1_4(a_) #undef bla21_4#define bla21_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) dp1_4(a_,bx,R2) wa1_4(a_)#undef bla31_4#define bla31_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \ dp1_4(a_,cx,R3) wa1_4(a_)#undef bla41_4#define bla41_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \ ddp1_4(a_,cx,R3) dp1_4(a_,dx,R4) wa1_4(a_)#undef bla1_4#define bla1_4(a_) Mjoin(Mjoin(bla,NDP),1_4)(a_)#undef inc1#define inc1(a_) a(a_,si) a(a_,ax)#undef inc2#define inc2(a_) inc1(a_) a(a_,bx)#undef inc3#define inc3(a_) inc2(a_) a(a_,cx)#undef inc4#define inc4(a_) inc3(a_) a(a_,dx)#undef inc#define inc(a_) Mjoin(inc,NDP)(a_)#ifdef PREFETCH/* #include "camm_arith.h" */#undef S#define S(a_,b_) (a_) + (b_)#undef PF1#define PF1 PREFETCH#undef PF2#define PF2 S(PF1,32)#undef PF3#define PF3 S(PF1,64)#undef PF4#define PF4 S(PF1,96)#undef PF5#define PF5 S(PF1,128)#undef PF6#define PF6 S(PF1,160)#undef PF7#define PF7 S(PF1,192)#undef PF8#define PF8 S(PF1,224)#else#undef PF1#define PF1 64#undef PF2#define PF2 96#undef PF3#define PF3 128#undef PF4#define PF4 160#undef PF5#define PF5 192#undef PF6#define PF6 224#undef PF7#define PF7 256#undef PF8#define PF8 288#endif#if defined(NO_TRANSPOSE) && !defined(SREAL) && !defined(GER)#undef pf#define pf(a_,b_) f(t0,a_,b_)#else#undef pf#define pf(a_,b_) f(nta,a_,b_)#endif#undef bl1#define bl1 bla1_4(0x0) inc(4)#undef bl2#define bl2 bla1_2(0x0) inc(8)#undef bl4#define bl4 bla(0x0,PF1) inc(16)#undef bl8#define bl8 bla(0x0,PF1) blb(0x1,PF1) inc(32) #undef bl16#define bl16 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) inc(64)#undef bl32#define bl32 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \ bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) inc(128)#undef bl64#define bl64 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \ bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) \ bla(0x8,PF5) blb(0x9,PF5) bla(0xa,PF6) blb(0xb,PF6) \ bla(0xc,PF7) blb(0xd,PF7) bla(0xe,PF8) blb(0xf,PF8) inc(256)/* #define in2 inc(8) *//* #define in4 inc(16) *//* #define in8 inc(32) *//* #define in16 inc(64) */#undef in2#define in2 #undef in4#define in4 #undef in8#define in8 #undef in16#define in16 #ifdef NO_TRANSPOSE#undef incf#define incf ra(di,si)#else#undef incf#define incf#endif#undef lf1#define lf1 mpx(R1)#undef lf2#define lf2 lf1 incf mpx(R2)#undef lf3#define lf3 lf2 incf mpx(R3)#undef lf4#define lf4 lf3 incf mpx(R4)#undef lf#define lf Mjoin(lf,NDP)#undef ulf1#define ulf1 ulfa(R1)#undef ulf2#define ulf2 ulf1 ra(di,si) ulfa(R2) #undef ulf3#define ulf3 ulf2 ra(di,si) ulfa(R3) #undef ulf4#define ulf4 ulf3 ra(di,si) ulfa(R4) #undef ulf#define ulf Mjoin(ulf,NDP)#undef lpba#define lpba(a_) "movl %%esi,%%e" #a_ "\n\t"#undef lpb1#define lpb1 lpba(ax)#undef lpb2#define lpb2 lpb1 ra(di,si) lpba(bx)#undef lpb3#define lpb3 lpb2 ra(di,si) lpba(cx)#undef lpb4#define lpb4 lpb3 ra(di,si) lpba(dx)#undef lpb#define lpb Mjoin(lpb,NDP)#undef ipf1#define ipf1(a_) pf(a_,si) pf(a_,ax)#undef ipf2#define ipf2(a_) ipf1(a_) pf(a_,bx) #undef ipf3#define ipf3(a_) ipf2(a_) pf(a_,cx) #undef ipf4#define ipf4(a_) ipf3(a_) pf(a_,dx) #undef ipf#define ipf(a_) Mjoin(ipf,NDP)(a_)#ifdef LUNROLL#undef UNROLL#ifdef SREAL#undef UNROLL#define UNROLL LUNROLL#elif defined(DREAL) || defined(SCPLX)#undef UNROLL#define UNROLL LUNROLL*2#elif defined(DCPLX)#undef UNROLL#define UNROLL LUNROLL*4#endif#else#undef UNROLL#define UNROLL 16#endif#undef UNROLL1_2#if UNROLL == 64#undef blUNROLL#define blUNROLL bl64#undef UNROLL1_2#define UNROLL1_2 32#elif UNROLL == 32#undef blUNROLL#define blUNROLL bl32#undef UNROLL1_2#define UNROLL1_2 16#elif UNROLL == 16#undef blUNROLL#define blUNROLL bl16#undef UNROLL1_2#define UNROLL1_2 8#elif UNROLL == 8#undef blUNROLL#define blUNROLL bl8#undef UNROLL1_2#define UNROLL1_2 4#elif UNROLL == 4#undef blUNROLL#define blUNROLL bl4#undef UNROLL1_2#define UNROLL1_2 2#elif UNROLL == 2#undef blUNROLL#define blUNROLL bl2#undef UNROLL1_2#define UNROLL1_2 1#elif UNROLL == 1#undef blUNROLL#define blUNROLL bl1#undef UNROLL1_2#define UNROLL1_2 stop#endif#ifndef UNROLL1_2#error UNROLL must be set to power of 2 < 128#endif#ifdef GER#undef aconst#define aconst#undef cconst#define cconst const#else#undef aconst#define aconst const#undef cconst#define cconst#endif#undef MY_FUNCTION#define MY_FUNCTION Mjoin(dp,EXT)static voidMY_FUNCTION(aconst TYPE *a,int lda, const TYPE *b, cconst TYPE *c,int stride,int len) {#ifdef SCPLX#if defined(GER) && defined(Conj_) const TYPE w1[2]={{-1.0,1.0},{-1.0,1.0}},*w=w1;#else const TYPE w1[2]={{1.0,-1.0},{1.0,-1.0}},*w=w1;#endif#endif#if defined(DCPLX) && defined(ATL_SSE2)#if defined(GER) && defined(Conj_) const TYPE w1[1]={{-1.0,1.0}},*w=w1;#else const TYPE w1[1]={{1.0,-1.0}},*w=w1;#endif#endif#ifdef NO_TRANSPOSE#undef movm#define movm c#undef fixm#define fixm b#else#undef movm#define movm b#undef fixm#define fixm c#endif NO_INLINE unsigned u1=stride*sizeof(*fixm),u2=lda*sizeof(*a),u3=len*sizeof(*movm)/sizeof(float); ASM ( "pushl %%ebx\n\t" a(4,sp)#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2)) "movl %6,%%esi\n\t" pl(0,si,SREG)#endif #ifdef NO_TRANSPOSE "movl %1,%%esi\n\t" /* fixm */ "movl %2,%%edi\n\t" /* fixm2fixm */#endif lf "movl %3,%%esi\n\t" /* a */ "movl %4,%%edi\n\t" /* a2a */ lpb ipf(0) "movl %0,%%esi\n\t" /* movm */ "movl %5,%%edi\n\t" /* len */#if defined(ALIGN)#if defined(SREAL) test(4,ax) je(Mjoin(a1,EXT)) test(-1,di) je(Mjoin(a1,EXT)) sub(1,di) bl1 lab(Mjoin(a1,EXT))#endif#if defined(DREAL) || defined(SREAL) test(8,ax) je(Mjoin(as,EXT)) test(-2,di) je(Mjoin(as,EXT)) sub(2,di) bl2 lab(Mjoin(as,EXT))#endif#endif ipf(32) lab(Mjoin(loop,EXT)) test(-UNROLL,di) je(Mjoin(UNROLL1_2,EXT)) sub(UNROLL,di) blUNROLL jmp(Mjoin(loop,EXT))#if UNROLL > 32 lab(Mjoin(32,EXT)) test(32,di) je(Mjoin(16,EXT)) bl32#endif #if UNROLL > 16 lab(Mjoin(16,EXT)) test(16,di) je(Mjoin(8,EXT)) bl16#endif #if UNROLL > 8 lab(Mjoin(8,EXT)) test(8,di) je(Mjoin(4,EXT)) bl8#endif #if UNROLL > 4 lab(Mjoin(4,EXT)) test(4,di) je(Mjoin(2,EXT)) bl4#endif#if UNROLL > 2 lab(Mjoin(2,EXT))#ifndef DCPLX test(2,di) je(Mjoin(1,EXT)) bl2#endif#endif#if UNROLL > 1 lab(Mjoin(1,EXT))#ifdef SREAL test(1,di) je(Mjoin(stop,EXT)) bl1#endif#endif lab(Mjoin(stop,EXT))#ifndef NO_TRANSPOSE "movl %1,%%esi\n\t" /* fixm */ "movl %2,%%edi\n\t" /* fixm2fixm */#endif ulf a(-4,sp) "popl %%ebx\n\t" ::"m" (movm),"m" (fixm),"m" (u1),"m" (a),"m" (u2),"m" (u3)#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2)) ,"m" (w)#endif :"ax","bx","cx","dx","si","di");}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -