📄 fftsse.nas
字号:
xorps xmm4,[Q_MPMP] subps xmm6,xmm4 ; = {--, --, s1, c1}; c3 = c1*t_c - s1*t_s;; s3 = s1*t_c + c1*t_s; shufps xmm6,xmm6,0x14 ; = {c1, s1, s1, c1} mulps xmm0,xmm6 movhlps xmm3,xmm0 xorps xmm3,[Q_MPMP] subps xmm0,xmm3 ; = {--, --, s3, c3} unpcklps xmm6,xmm0 ; xmm6 = {s3, s1, c3, c1} shufps xmm6,xmm6,0xB4 ; xmm6 = {s1, s3, c3, c1}; c2 = c1*c1 - s1*s1;; c4 = c3*c3 - s3*s3;; s4 = s3*c3 + s3*c3;; s2 = s1*c1 + s1*c1; movaps xmm7,xmm6 movaps xmm4,xmm6 shufps xmm7,xmm7,0x14 shufps xmm4,xmm4,0xEB xorps xmm4,[Q_MMPP] ; = {-c3,-c1, s3, s1} mulps xmm7,xmm6 mulps xmm4,xmm6 shufps xmm4,xmm4,0x1B addps xmm7,xmm4 ; xmm7 = {s2, s4, c4, c2}; fi = fz +i;; gi = fz +k1-i; mov edi,[esp+_P+4] ; = fz mov esi,ebx shr esi,1 sub edi,esi ; edi = fz - i/2 lea esi,[edi + ebx] ; esi = fi = fz +i/2 add edi,eax ; edi = gi = fz +k1-i/2 sub edi,4; do{.lp220:; unroll稿のdo loopは51+4炭吾; a = c2*fi[k1 ] + s2*gi[k1 ];; e = c4*fi[k1+1] + s4*gi[k1-1];; f = s4*fi[k1+1] - c4*gi[k1-1];; b = s2*fi[k1 ] - c2*gi[k1 ];; c = c2*fi[k3 ] + s2*gi[k3 ];; g = c4*fi[k3+1] + s4*gi[k3-1];; h = s4*fi[k3+1] - c4*gi[k3-1];; d = s2*fi[k3 ] - c2*gi[k3 ]; movaps xmm4,xmm7 ; xmm7 = {s2, s4, c4, c2} shufps xmm4,xmm4,0x1B xorps xmm4,[Q_MMPP] movlps xmm0,[esi+eax] movlps xmm1,[edi+eax] movlps xmm2,[esi+edx] movlps xmm3,[edi+edx] shufps xmm0,xmm0,0x14 shufps xmm1,xmm1,0x41 shufps xmm2,xmm2,0x14 shufps xmm3,xmm3,0x41 mulps xmm0,xmm7 mulps xmm1,xmm4 mulps xmm2,xmm7 mulps xmm3,xmm4 addps xmm0,xmm1 ; xmm0 = {b, f, e, a} addps xmm2,xmm3 ; xmm2 = {d, h, g, c};17; f0 = fi[0 ] + a;; f4 = fi[0 +1] + e;; g4 = gi[0 -1] + f;; g0 = gi[0 ] + b;; f1 = fi[0 ] - a;; f5 = fi[0 +1] - e;; g5 = gi[0 -1] - f;; g1 = gi[0 ] - b;; f2 = fi[k2 ] + c;; f6 = fi[k2+1] + g;; g6 = gi[k2-1] + h;; g2 = gi[k2 ] + d;; f3 = fi[k2 ] - c;; f7 = fi[k2+1] - g;; g7 = gi[k2-1] - h;; g3 = gi[k2 ] - d; movlps xmm4,[esi ] movhps xmm4,[edi ] movaps xmm1,xmm4 subps xmm1,xmm0 ; xmm1 = {g1, g5, f5, f1} movlps xmm5,[esi+eax*2] movhps xmm5,[edi+eax*2] movaps xmm3,xmm5 subps xmm3,xmm2 ; xmm3 = {g3, g7, f7, f3} addps xmm0,xmm4 ; xmm0 = {g0, g4, f4, f0} addps xmm2,xmm5 ; xmm2 = {g2, g6, f6, f2};10; a = c1*f2 + s1*g3; 界*界 + 嫡*嫡; e = c3*f6 + s3*g7;; g = s3*g6 + c3*f7;; c = s1*g2 + c1*f3;; d = c1*g2 - s1*f3; 界*嫡 - 嫡*界; h = c3*g6 - s3*f7;; f = s3*f6 - c3*g7;; b = s1*f2 - c1*g3; movaps xmm5,xmm6 ; xmm6 = {s1, s3, c3, c1} shufps xmm5,xmm5,0x1B ; = {c1, c3, s3, s1} movaps xmm4,xmm2 mulps xmm4,xmm6 shufps xmm2,xmm2,0x1B ; xmm2 = {f2, f6, g6, g2} mulps xmm2,xmm6 mulps xmm5,xmm3 mulps xmm3,xmm6 shufps xmm3,xmm3,0x1B addps xmm4,xmm3 ; = {c, g, e, a} subps xmm2,xmm5 ; = {b, f, h, d};10; fi[0 ] = f0 + a;; fi[0 +1] = f4 + e;; gi[0 -1] = g4 + g;; gi[0 ] = g0 + c;; fi[k2 ] = f0 - a;; fi[k2+1] = f4 - e;; gi[k2-1] = g4 - g;; gi[k2 ] = g0 - c;; fi[k1 ] = f1 + d;; fi[k1+1] = f5 + h;; gi[k1-1] = g5 + f;; gi[k1 ] = g1 + b;; fi[k3 ] = f1 - d;; fi[k3+1] = f5 - h;; gi[k3-1] = g5 - f;; gi[k3 ] = g1 - b; movaps xmm3,xmm0 subps xmm0,xmm4 movlps [esi+eax*2],xmm0 movhps [edi+eax*2],xmm0 addps xmm4,xmm3 movlps [esi ],xmm4 movhps [edi ],xmm4 movaps xmm5,xmm1 subps xmm1,xmm2 movlps [esi+edx ],xmm1 movhps [edi+edx ],xmm1 addps xmm2,xmm5 movlps [esi+eax ],xmm2 movhps [edi+eax ],xmm2; 14; gi += k4;; fi += k4; lea edi,[edi + eax*4] ; gi += (k1 * 4); lea esi,[esi + eax*4] ; fi += (k1 * 4); cmp esi,ebp jl near .lp220 ; while (fi<fn);; } while (fi<fn); add ebx,4*4 ; i+= 4 cmp ebx,eax ; i < k1 jl near .lp22; }.F22: cmp eax,[esp+_P+8] ; while ((k1 * 4)<n); jl near .lp2 pop ebp pop edi pop esi pop ebx ret;------------------------------------------------------------------------; 99/11/12 Initial version for SSE by K. SAKAI, 4300clk@P3; This routine is very slow when wsamp_r_int is not aligned to 16byte boundary.;;void fft_side_SSE( float in[2][1024], int s, float *ret); energy = (in[0][512] - in[1][512])^2;; energy = (in[0][1024-s] - in[1][1024-s])^2;; for (i=s,j=1024-s;i<512;i++,j--){; a = in[0][i] - in[1][i];; energy += a*a;; b = in[0][j-1] - in[1][j-1];; energy += b*b;; }; *ret = energy * 0.25; align 16fft_side_SSE: mov ecx,[esp+8] ; = i = s mov edx,1024 sub edx,ecx ; = j mov eax,[esp+4] ; = in movss xmm7,[eax+1024*0*4+512*4] movss xmm1,[eax+1024*1*4+512*4] subss xmm7,xmm1 mulss xmm7,xmm7 movss xmm2,[eax+1024*0*4+edx*4] movss xmm3,[eax+1024*1*4+edx*4] subss xmm2,xmm3 mulss xmm2,xmm2 addss xmm7,xmm2 test cl,1 jz .even.odd: dec edx movss xmm0,[eax+1024*0*4+ecx*4] movss xmm1,[eax+1024*1*4+ecx*4] inc ecx movss xmm2,[eax+1024*0*4+edx*4] movss xmm3,[eax+1024*1*4+edx*4] cmp ecx,edx subss xmm0,xmm1 subss xmm2,xmm3 mulss xmm0,xmm0 mulss xmm2,xmm2 addss xmm7,xmm0 addss xmm7,xmm2 je near .exit1.even: test cl,2 jz .f0 sub edx,2 movlps xmm0,[eax+1024*0*4+ecx*4] movlps xmm1,[eax+1024*1*4+ecx*4] add ecx,2 movhps xmm0,[eax+1024*0*4+edx*4] movhps xmm1,[eax+1024*1*4+edx*4] cmp ecx,edx subps xmm0,xmm1 mulps xmm0,xmm0 addps xmm7,xmm0 je .exit4 jmp short .f0 align 16.f0:.lp0: sub edx,4 movaps xmm0,[eax+1024*0*4+ecx*4] movaps xmm1,[eax+1024*1*4+ecx*4] add ecx,4 subps xmm0,xmm1 mulps xmm0,xmm0 addps xmm7,xmm0 movaps xmm2,[eax+1024*0*4+edx*4] movaps xmm3,[eax+1024*1*4+edx*4] cmp ecx,edx subps xmm2,xmm3 mulps xmm2,xmm2 addps xmm7,xmm2 jne .lp0.exit4: movhlps xmm6,xmm7 addps xmm7,xmm6 movaps xmm6,xmm7 shufps xmm6,xmm6,01010101B addss xmm7,xmm6.exit1: mulss xmm7,[S_025] mov eax,[esp+12] movss [eax],xmm7 ret end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -