📄 fftsse.nas
字号:
; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA; GOGO-no-coda; Copyright (C) 1999 shigeo; special thanks to Keiichi SAKAI%include "nasm.h" globaldef fht_SSE globaldef fft_side_SSE externdef costab_fft externdef sintab_fft segment_data align 16Q_MMPP dd 0x0,0x0,0x80000000,0x80000000Q_MPMP dd 0x0,0x80000000,0x0,0x80000000Q_002 dd 0.02236068, 0.02236068, 0.02236068, 0.02236068D_SQRT2 dd 1.414213562,1.414213562S_025 dd 0.25S_05 DD 0.5S_00005 DD 0.0005 segment_code;------------------------------------------------------------------------; by K. SAKAI; 99/08/18 PIII 23k[clk]; 99/08/19 炭吾界进掐れ垂え PIII 22k[clk]; 99/08/20 bit reversal を奠羔稿から败竣した PIII 17k[clk]; 99/08/23 办婶 unroll PIII 14k[clk]; 99/11/12 clean up;;void fht_SSE(float *fz, int n); align 16fht_SSE: push ebx push esi push edi push ebp%assign _P 4*4 ;2つ誊のル〖プ mov eax,[esp+_P+4] ;eax=fz mov ebp,[esp+_P+8] ;=n shl ebp,2 add ebp,eax ; fn = fz + n, この簇眶姜位まで稍恃 xor ecx,ecx ; ecx=k=0 xor eax,eax mov al,4 ; =k1=1*(sizeof float) // 4, 16, 64, 256,... xor edx,edx mov dl,12 ; =k3=3*k1 jmp short .lp2 align 16.lp2: ; do{ add cl,2 ; k += 2; shl eax,2 shl edx,2 mov esi,[esp+_P+4] ;esi=fi=fz mov edi,eax shr edi,1 add edi,esi ; edi=gi=fi+ki/2; たかだか2事误しか袋略できない婶尸はFPUのほうが庐い。 movss xmm7,[D_SQRT2] jmp short .lp20 align 16.lp20: ; do{; f0 = fi[0 ] + fi[k1];; f2 = fi[k2] + fi[k3];; f1 = fi[0 ] - fi[k1];; f3 = fi[k2] - fi[k3];; fi[0 ] = f0 + f2;; fi[k1] = f1 + f3;; fi[k2] = f0 - f2;; fi[k3] = f1 - f3; fld dword [esi] fadd dword [esi+eax] fld dword [esi+eax*2] fadd dword [esi+edx] fld dword [esi] fsub dword [esi+eax] fld dword [esi+eax*2] fsub dword [esi+edx] fld st1 fadd st0,st1 fstp dword [esi+eax] fsubp st1,st0 fstp dword [esi+edx] fld st1 fadd st0,st1 fstp dword [esi] fsubp st1,st0 fstp dword [esi+eax*2] lea esi,[esi + eax*4] ; = fi += (k1 * 4);; add esi,eax; add esi,edx; g0 = gi[0 ] + gi[k1];; g2 = SQRT2 * gi[k2];; g1 = gi[0 ] - gi[k1];; g3 = SQRT2 * gi[k3];; gi[0 ] = g0 + g2;; gi[k2] = g0 - g2;; gi[k1] = g1 + g3;; gi[k3] = g1 - g3; fld dword [edi] fadd dword [edi+eax] fld dword [D_SQRT2] fmul dword [edi+eax*2] fld dword [edi] fsub dword [edi+eax] fld dword [D_SQRT2] fmul dword [edi+edx] fld st1 fadd st0,st1 fstp dword [edi+eax] fsubp st1,st0 fstp dword [edi+edx] fld st1 fadd st0,st1 fstp dword [edi] fsubp st1,st0 fstp dword [edi+eax*2] lea edi,[edi + eax*4] ; = gi += (k1 * 4); cmp esi,ebp jl near .lp20 ; while (fi<fn);; i = 1; //for (i=1;i<kx;i++){; c1 = 1.0*t_c - 0.0*t_s;; s1 = 0.0*t_c + 1.0*t_s; movss xmm6,[costab_fft + ecx*4] movss xmm1,[sintab_fft + ecx*4] shufps xmm6,xmm1,0x00 ; = {s1, s1, c1, c1} shufps xmm6,xmm6,0x28 ; = {+c1, +s1, +s1, +c1}; c2 = c1*c1 - s1*s1;; s2 = c1*s1 + s1*c1; movaps xmm4,xmm6 movaps xmm7,xmm6 unpcklps xmm4,xmm4 ; = {s1, s1, c1, c1} shufps xmm7,xmm7,0x41 mulps xmm4,xmm6 ; = {s1*c1, s1*s1, c1*s1, c1*c1} xorps xmm7,[Q_MMPP] ; = {-s1, -c1, +c1, +s1} movhlps xmm3,xmm4 xorps xmm3,[Q_MPMP] subps xmm4,xmm3 ; = {--, --, s2, c2} movlhps xmm4,xmm4 ; = {+s2, +c2, +s2, +c2} movaps xmm5,xmm4 shufps xmm5,xmm5,0x11 xorps xmm5,[Q_MPMP] ; = {-c2, +s2, -c2, +s2} mov esi,[esp+_P+4] ; = fz lea edi,[esi + eax - 4] ; edi = gi = fz +k1-i add esi,4 ; esi = fi = fz + i jmp short .lp21 align 16.lp21: ; do{; a = c2*fi[k1] + s2*gi[k1];; b = s2*fi[k1] - c2*gi[k1];; c = c2*fi[k3] + s2*gi[k3];; d = s2*fi[k3] - c2*gi[k3];; f0 = fi[0 ] + a;; g0 = gi[0 ] + b;; f2 = fi[k1 * 2] + c;; g2 = gi[k1 * 2] + d;; f1 = fi[0 ] - a;; g1 = gi[0 ] - b;; f3 = fi[k1 * 2] - c;; g3 = gi[k1 * 2] - d; movss xmm0,[esi + eax] ; = fi[k1] movss xmm2,[esi + edx] ; = fi[k3] shufps xmm0,xmm2,0x00 ; = {fi[k3], fi[k3], fi[k1], fi[k1]} movss xmm1,[edi + eax] ; = fi[k1] movss xmm3,[edi + edx] ; = fi[k3] shufps xmm1,xmm3,0x00 ; = {gi[k3], gi[k3], gi[k1], gi[k1]} movss xmm2,[esi] ; = fi[0] mulps xmm0,xmm4 ; *= {+s2, +c2, +s2, +c2} movss xmm3,[esi + eax*2] ; = fi[k2] unpcklps xmm2,xmm3 ; = {--, --, fi[k2], fi[0]} mulps xmm1,xmm5 ; *= {-c2, +s2, -c2, +s2} movss xmm3,[edi + eax*2] ; = gi[k2] addps xmm0,xmm1 ; = {d, c, b, a} movss xmm1,[edi] ; = gi[0] unpcklps xmm1,xmm3 ; = {--, --, gi[k2], gi[0]} unpcklps xmm2,xmm1 ; = {gi[k2], fi[k2], gi[0], fi[0]} movaps xmm1,xmm2 addps xmm1,xmm0 ; = {g2, f2, g0, f0} subps xmm2,xmm0 ; = {g3, f3, g1, f1}; a = c1*f2 + s1*g3;; c = s1*g2 + c1*f3;; b = s1*f2 - c1*g3;; d = c1*g2 - s1*f3;; fi[0 ] = f0 + a;; gi[0 ] = g0 + c;; gi[k1] = g1 + b;; fi[k1] = f1 + d;; fi[k1 * 2] = f0 - a;; gi[k1 * 2] = g0 - c;; gi[k3] = g1 - b;; fi[k3] = f1 - d; movaps xmm3,xmm1 movhlps xmm1,xmm1 ; = {g2, f2, g2, f2} shufps xmm3,xmm2,0x14 ; = {f1, g1, g0, f0} mulps xmm1,xmm6 ; *= {+c1, +s1, +s1, +c1} shufps xmm2,xmm2,0xBB ; = {f3, g3, f3, g3} mulps xmm2,xmm7 ; *= {-s1, -c1, +c1, +s1} addps xmm1,xmm2 ; = {d, b, c, a} movaps xmm2,xmm3 addps xmm3,xmm1 ; = {fi[k1], gi[k1], gi[0], fi[0]} subps xmm2,xmm1 ; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]} movhlps xmm0,xmm3 movss [esi],xmm3 shufps xmm3,xmm3,0x55 movss [edi+eax],xmm0 shufps xmm0,xmm0,0x55 movss [edi],xmm3 movss [esi+eax],xmm0 movhlps xmm0,xmm2 movss [esi+eax*2],xmm2 shufps xmm2,xmm2,0x55 movss [edi+edx],xmm0 shufps xmm0,xmm0,0x55 movss [edi+eax*2],xmm2 lea edi,[edi + eax*4] ; gi += (k1 * 4); movss [esi+edx],xmm0 lea esi,[esi + eax*4] ; fi += (k1 * 4); cmp esi,ebp jl near .lp21 ; while (fi<fn);; unroll涟のdo loopは43+4炭吾; 呵柒件ではないforル〖プをunrollingした; kx= 2, 8, 32, 128; k4= 16, 64, 256, 1024; 0, 6/2,30/2,126/2; at here; xmm6 = {--, --, s1, c1}; c3 = c1; s3 = s1; xor ebx,ebx mov bl,4*4 ; = i = 4 cmp ebx,eax ; i < k1 jnl near .F22 shufps xmm6,xmm6,0x14 ; = {c1, s1, s1, c1} jmp short .F220 align 16; for (i=4;i<k1;i+=4){ // for (i=2;i<k1/2;i+=2){.lp22: shufps xmm6,xmm6,0x69 ; xmm6 = {c3, s3, s3, c3}.F220:; at here, xmm6 is {c3, s3, s3, c3}; c1 = c3*t_c - s3*t_s;; s1 = c3*t_s + s3*t_c; movss xmm0,[costab_fft + ecx*4] movss xmm1,[sintab_fft + ecx*4] shufps xmm0,xmm1,0x00 ; = {t_s, t_s, t_c, t_c} mulps xmm6,xmm0 movhlps xmm4,xmm6
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -