⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 camm_dpa.h

📁 基于 BLAS / CLAPACK 的代码，用过的人知道是干啥的
💻 H
📖 第 1 页 / 共 3 页
字号:
#undef dp#define dp(a_,b_,c_)        fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \                                                 fm(sri(c_),0) fap(0,tri(c_))\                            fl(a_ ## 8,b_)       fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \                                                 fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))#undef dpp#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \                                                 pf(d_,e_) fm(sri(c_),0) fap(0,tri(c_))\                            fl(a_ ## 8,b_)       fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \                                                 fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))#endif#else#undef plaa#define plaa(a_)            fl(a_ ## 0,si) fl(a_ ## 8,si) plax#undef wa#define wa(a_)#undef ddprr#define ddprr(a_,b_,c_)     fl(a_ ## 0,b_) \                                              fd(tri(c_))           fm(P(sri(c_),1),0)      fap(0,1) \                                              fd(M(trr(c_),1))      fm(srr(c_),0)           fspi(0,1) \                            fp(a_ ## 0,b_) #undef ddpri#define ddpri(a_,b_,c_)     fl(a_ ## 8,b_) \                                              fd(tii(c_))           fm(P(sii(c_),1),0)      fap(0,1) \                                              fd(M(tir(c_),1))      fm(sir(c_),0)           fapi(0,1) \                            fp(a_ ## 8,b_) #undef dpri#define dpri(a_,b_,c_)      fl(a_ ## 8,b_) \                                              fx(2)                 fm(sir(c_),0)           fap(0,2) \                                                                    fm(M(sii(c_),2),0)      fapi(0,1) \                            fp(a_ ## 8,b_)#undef ddpp#define ddpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) ddpri(a_,b_,c_)#undef ddp#define ddp(a_,b_,c_)        ddprr(a_,b_,c_)           ddpri(a_,b_,c_)#undef dpp#define dpp(a_,b_,c_,d_,e_)  ddprr(a_,b_,c_) pf(d_,e_) dpri(a_,b_,c_)#undef dp#define dp(a_,b_,c_)         
ddprr(a_,b_,c_)           dpri(a_,b_,c_)#endif#undef R1#define R1 4#undef R2#define R2 6#undef R3#define R3 6#undef R4#define R4 6#endif#endif/****************************************************************************** *  General Macros ******************************************************************************/  #undef bla1#define bla1(a_,b_)          plaa(a_) dpp(a_,ax,R1,b_,si) wa(a_) #undef blb1#define blb1(a_,b_)          plaa(a_) dpp(a_,ax,R1,b_,ax) wa(a_)			     #undef bla2#undef bla2#define bla2(a_,b_)          pf(b_,si) plaa(a_) ddp(a_,ax,R1)        pf(b_,ax) dp(a_,bx,R2) wa(a_)#undef blb2#undef blb2#define blb2(a_,b_)                    plaa(a_) ddpp(a_,ax,R1,b_,bx)           dp(a_,bx,R2) wa(a_) 			     #undef bla3#define bla3(a_,b_)          plaa(a_) ddpp(a_,ax,R1,b_,si) ddp(a_,bx,R2) \                             dpp(a_,cx,R3,b_,ax) wa(a_)#undef blb3#define blb3(a_,b_)          plaa(a_) ddpp(a_,ax,R1,b_,bx) ddp(a_,bx,R2) \                             dpp(a_,cx,R3,b_,cx) wa(a_)			     #undef bla4#define bla4(a_,b_)          plaa(a_) ddpp(a_,ax,R1,b_,si) ddpp(a_,bx,R2,b_,ax) \                             ddp(a_,cx,R3) dpp(a_,dx,R4,b_,bx) wa(a_)#undef blb4#define blb4(a_,b_)          plaa(a_) ddp(a_,ax,R1)        ddpp(a_,bx,R2,b_,cx) \                             ddp(a_,cx,R3) dpp(a_,dx,R4,b_,dx) wa(a_)#undef bla#define bla(a_,b_)      Mjoin(bla,NDP)(a_,b_)#undef blb#define blb(a_,b_)      Mjoin(blb,NDP)(a_,b_)#undef bla11_2#define bla11_2(a_)    plaa1_2(a_) dp1_2(a_,ax,R1) wa1_2(a_) #undef bla21_2#define bla21_2(a_)    plaa1_2(a_) ddp1_2(a_,ax,R1) dp1_2(a_,bx,R2) wa1_2(a_)#undef bla31_2#define bla31_2(a_)    plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \                          dp1_2(a_,cx,R3) wa1_2(a_)#undef bla41_2#define bla41_2(a_)    plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \                          ddp1_2(a_,cx,R3) dp1_2(a_,dx,R4) wa1_2(a_)#undef bla1_2#define bla1_2(a_)     Mjoin(Mjoin(bla,NDP),1_2)(a_)#undef bla11_4#define 
bla11_4(a_)    plaa1_4(a_) dp1_4(a_,ax,R1) wa1_4(a_) #undef bla21_4#define bla21_4(a_)    plaa1_4(a_) ddp1_4(a_,ax,R1) dp1_4(a_,bx,R2) wa1_4(a_)#undef bla31_4#define bla31_4(a_)    plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \                          dp1_4(a_,cx,R3) wa1_4(a_)#undef bla41_4#define bla41_4(a_)    plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \                          ddp1_4(a_,cx,R3) dp1_4(a_,dx,R4) wa1_4(a_)#undef bla1_4#define bla1_4(a_)     Mjoin(Mjoin(bla,NDP),1_4)(a_)#undef inc1#define inc1(a_)        a(a_,si) a(a_,ax)#undef inc2#define inc2(a_)        inc1(a_) a(a_,bx)#undef inc3#define inc3(a_)        inc2(a_) a(a_,cx)#undef inc4#define inc4(a_)        inc3(a_) a(a_,dx)#undef inc#define inc(a_)         Mjoin(inc,NDP)(a_)#ifdef PREFETCH/* #include "camm_arith.h" */#undef S#define S(a_,b_) (a_) + (b_)#undef PF1#define PF1 PREFETCH#undef PF2#define PF2 S(PF1,32)#undef PF3#define PF3 S(PF1,64)#undef PF4#define PF4 S(PF1,96)#undef PF5#define PF5 S(PF1,128)#undef PF6#define PF6 S(PF1,160)#undef PF7#define PF7 S(PF1,192)#undef PF8#define PF8 S(PF1,224)#else#undef PF1#define PF1 64#undef PF2#define PF2 96#undef PF3#define PF3 128#undef PF4#define PF4 160#undef PF5#define PF5 192#undef PF6#define PF6 224#undef PF7#define PF7 256#undef PF8#define PF8 288#endif#if defined(NO_TRANSPOSE) && !defined(SREAL) && !defined(GER)#undef pf#define pf(a_,b_)  f(t0,a_,b_)#else#undef pf#define pf(a_,b_)  f(nta,a_,b_)#endif#undef bl1#define bl1            bla1_4(0x0) inc(4)#undef bl2#define bl2            bla1_2(0x0) inc(8)#undef bl4#define bl4            bla(0x0,PF1) inc(16)#undef bl8#define bl8            bla(0x0,PF1) blb(0x1,PF1) inc(32) #undef bl16#define bl16           bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) inc(64)#undef bl32#define bl32           bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \                       bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) inc(128)#undef bl64#define bl64           bla(0x0,PF1) blb(0x1,PF1) 
bla(0x2,PF2) blb(0x3,PF2) \                       bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) \                       bla(0x8,PF5) blb(0x9,PF5) bla(0xa,PF6) blb(0xb,PF6) \                       bla(0xc,PF7) blb(0xd,PF7) bla(0xe,PF8) blb(0xf,PF8) inc(256)/* #define in2           inc(8) *//* #define in4           inc(16) *//* #define in8           inc(32) *//* #define in16          inc(64) */#undef in2#define in2  #undef in4#define in4  #undef in8#define in8  #undef in16#define in16 #ifdef NO_TRANSPOSE#undef incf#define incf           ra(di,si)#else#undef incf#define incf#endif#undef lf1#define lf1            mpx(R1)#undef lf2#define lf2            lf1 incf mpx(R2)#undef lf3#define lf3            lf2 incf mpx(R3)#undef lf4#define lf4            lf3 incf mpx(R4)#undef lf#define lf             Mjoin(lf,NDP)#undef ulf1#define ulf1           ulfa(R1)#undef ulf2#define ulf2           ulf1 ra(di,si) ulfa(R2) #undef ulf3#define ulf3           ulf2 ra(di,si) ulfa(R3) #undef ulf4#define ulf4           ulf3 ra(di,si) ulfa(R4) #undef ulf#define ulf            Mjoin(ulf,NDP)#undef lpba#define lpba(a_)      "movl %%esi,%%e" #a_ "\n\t"#undef lpb1#define lpb1          lpba(ax)#undef lpb2#define lpb2          lpb1 ra(di,si) lpba(bx)#undef lpb3#define lpb3          lpb2 ra(di,si) lpba(cx)#undef lpb4#define lpb4          lpb3 ra(di,si) lpba(dx)#undef lpb#define lpb           Mjoin(lpb,NDP)#undef ipf1#define ipf1(a_)   pf(a_,si) pf(a_,ax)#undef ipf2#define ipf2(a_)   ipf1(a_)  pf(a_,bx) #undef ipf3#define ipf3(a_)   ipf2(a_)  pf(a_,cx) #undef ipf4#define ipf4(a_)   ipf3(a_)  pf(a_,dx) #undef ipf#define ipf(a_)     Mjoin(ipf,NDP)(a_)#ifdef LUNROLL#undef UNROLL#ifdef SREAL#undef UNROLL#define UNROLL LUNROLL#elif defined(DREAL) || defined(SCPLX)#undef UNROLL#define UNROLL LUNROLL*2#elif defined(DCPLX)#undef UNROLL#define UNROLL LUNROLL*4#endif#else#undef UNROLL#define UNROLL 16#endif#undef UNROLL1_2#if UNROLL == 64#undef blUNROLL#define blUNROLL bl64#undef UNROLL1_2#define 
UNROLL1_2 32#elif UNROLL == 32#undef blUNROLL#define blUNROLL bl32#undef UNROLL1_2#define UNROLL1_2 16#elif UNROLL == 16#undef blUNROLL#define blUNROLL bl16#undef UNROLL1_2#define UNROLL1_2 8#elif UNROLL == 8#undef blUNROLL#define blUNROLL bl8#undef UNROLL1_2#define UNROLL1_2 4#elif UNROLL == 4#undef blUNROLL#define blUNROLL bl4#undef UNROLL1_2#define UNROLL1_2 2#elif UNROLL == 2#undef blUNROLL#define blUNROLL bl2#undef UNROLL1_2#define UNROLL1_2 1#elif UNROLL == 1#undef blUNROLL#define blUNROLL bl1#undef UNROLL1_2#define UNROLL1_2 stop#endif#ifndef UNROLL1_2#error UNROLL must be set to power of 2 < 128#endif#ifdef GER#undef aconst#define aconst#undef cconst#define cconst const#else#undef aconst#define aconst const#undef cconst#define cconst#endif#undef MY_FUNCTION#define MY_FUNCTION Mjoin(dp,EXT)static voidMY_FUNCTION(aconst TYPE *a,int lda,	      const TYPE *b,	      cconst TYPE *c,int stride,int len) {#ifdef SCPLX#if defined(GER) && defined(Conj_)    const TYPE w1[2]={{-1.0,1.0},{-1.0,1.0}},*w=w1;#else    const TYPE w1[2]={{1.0,-1.0},{1.0,-1.0}},*w=w1;#endif#endif#if defined(DCPLX) && defined(ATL_SSE2)#if defined(GER) && defined(Conj_)    const TYPE w1[1]={{-1.0,1.0}},*w=w1;#else    const TYPE w1[1]={{1.0,-1.0}},*w=w1;#endif#endif#ifdef NO_TRANSPOSE#undef movm#define movm c#undef fixm#define fixm b#else#undef movm#define movm b#undef fixm#define fixm c#endif        NO_INLINE    unsigned u1=stride*sizeof(*fixm),u2=lda*sizeof(*a),u3=len*sizeof(*movm)/sizeof(float);    ASM (	 "pushl %%ebx\n\t"	 a(4,sp)#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2))	 "movl %6,%%esi\n\t"	 pl(0,si,SREG)#endif	 #ifdef NO_TRANSPOSE	 "movl %1,%%esi\n\t"  /* fixm */	 "movl %2,%%edi\n\t"  /* fixm2fixm */#endif	 lf	 "movl %3,%%esi\n\t"  /* a */	 "movl %4,%%edi\n\t"  /* a2a */	 lpb	 ipf(0)	 "movl %0,%%esi\n\t"  /* movm */	 "movl %5,%%edi\n\t"  /* len */#if defined(ALIGN)#if defined(SREAL)	 test(4,ax)	 je(Mjoin(a1,EXT))	 test(-1,di)	 je(Mjoin(a1,EXT))	 sub(1,di)	 bl1	 
lab(Mjoin(a1,EXT))#endif#if defined(DREAL) || defined(SREAL)	 test(8,ax)	 je(Mjoin(as,EXT))	 test(-2,di)	 je(Mjoin(as,EXT))	 sub(2,di)	 bl2	 lab(Mjoin(as,EXT))#endif#endif	      	 ipf(32)	 lab(Mjoin(loop,EXT))	 test(-UNROLL,di)	 je(Mjoin(UNROLL1_2,EXT))	 sub(UNROLL,di)	 blUNROLL	 	 jmp(Mjoin(loop,EXT))#if UNROLL > 32	 lab(Mjoin(32,EXT))	 test(32,di)	 je(Mjoin(16,EXT))	 bl32#endif	 #if UNROLL > 16	 lab(Mjoin(16,EXT))	 test(16,di)	 je(Mjoin(8,EXT))	 bl16#endif	 #if UNROLL > 8	 lab(Mjoin(8,EXT))	 test(8,di)	 je(Mjoin(4,EXT))	 bl8#endif	 #if UNROLL > 4	 lab(Mjoin(4,EXT))	 test(4,di)	 je(Mjoin(2,EXT))	 bl4#endif#if UNROLL > 2	 	 lab(Mjoin(2,EXT))#ifndef DCPLX	 test(2,di)	 je(Mjoin(1,EXT))	 bl2#endif#endif#if UNROLL > 1	 lab(Mjoin(1,EXT))#ifdef SREAL	 test(1,di)	 je(Mjoin(stop,EXT))	 bl1#endif#endif	 lab(Mjoin(stop,EXT))#ifndef NO_TRANSPOSE	 "movl %1,%%esi\n\t"  /* fixm */	 "movl %2,%%edi\n\t"  /* fixm2fixm */#endif	 ulf	 a(-4,sp)	 "popl %%ebx\n\t"	 ::"m" (movm),"m" (fixm),"m" (u1),"m" (a),"m" (u2),"m" (u3)#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2))	 ,"m" (w)#endif	 :"ax","bx","cx","dx","si","di");}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -