; fft3dn.nas
; (code-viewer residue: "Font size:")
; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
; GOGO-no-coda
; Copyright (C) 1999 shigeo
; special thanks to URURI
;
; Syntax: NASM (Intel operand order), 32-bit x86, 3DNow! / Enhanced 3DNow!.
; The proc/arg/local/alloc/sp()/pushd/popd/fsizen() macros, the r0..r6
; register aliases and the pmov/pmovd/pupldq/puphdq mnemonics all come from
; nasm.h, which is not part of this file.
; NOTE(review): pmov/pmovd/pupldq/puphdq presumably expand to
; movq/movd/punpckldq/punpckhdq -- confirm against nasm.h.
;
; Lane notation used in the comments below: "high dword | low dword" of an
; MMX register (e.g. "-fi1 | fi1" after XOR-ing with D_MSB1_0).

%include "nasm.h"

        externdef costab_fft
        externdef sintab_fft

        segment_data
        align   32
; Packed pairs of single-precision constants; the first dd is the low lane.
D_1_41421       dd      1.41421356 , 1.41421356
D_1_0           dd      1.0        , 1.0
D_0_5           dd      0.5        , 0.5
D_0_25          dd      0.25       , 0.25
D_0_02236       dd      0.02236067 , 0.02236067
D_0_0005        dd      0.0005     , 0.0005
D_0_0           dd      0.0        , 0.0
D_1_0_D_0_0     dd      0.0        , 1.0          ; 1.0 | 0.0
D_0_0_D_1_0     dd      1.0        , 0.0          ; 0.0 | 1.0
; Sign-bit masks: XOR with these negates the selected lane(s).
D_MSB1_0        dd      0x00000000 , 0x80000000   ; negate high lane only
D_MSB1_1        dd      0x80000000 , 0x80000000   ; negate both lanes
D_MSB0_1        dd      0x80000000 , 0x00000000   ; negate low lane only

        segment_code

;-----------------------------------------------------------------------
; void fht_3DN2(float *fz, int n);
;
; In-place transform over fz[0..n) using 3DNow! instructions.
; Twiddle factors are read from costab_fft / sintab_fft at index k, where
; k starts at 2 and advances by 2 per outer pass.
;
; Register roles (set up in .do_init, scaled by fsize = 4 bytes):
;   r0 = fi (byte pointer)      r1 = gi (byte pointer)
;   r2 = k3 * fsize             r3 = k1 * fsize (numerically equals k4)
;   r4 = kx * fsize             r5 = i  * fsize (.for loop counter)
;   r6 = fn = fz + n * fsize    (end of data; bound for the .do2/.do3 loops)
;
; The outer loop ends when r3 >= n.
; Saves/restores ebp, ebx, esi, edi; brackets MMX use with femms.
;-----------------------------------------------------------------------
proc    fht_3DN2
%$fz            arg     4
%$n             arg     4
%$k             local   4
%$Ps2_Pc2       local   8
%$Mc2_Ps2       local   8
%$t_s           local   8
%$t_c           local   8
        alloc

        femms
        pushd   ebp, ebx, esi, edi

fht_3DN_1st_part:
fht_3DN_2nd_part:
fht_3DN_3rd_part:
.do_init:
        mov     r3, 16                  ;k1*fsize = 4*fsize = k4
        mov     r4, 8                   ;kx = k1/2
        mov     r2, 48                  ;k3*fsize
        mov     dword [sp(%$k)], 2      ;k = 2
        mov     r0, [sp(%$fz)]          ;fi
        ; FIX: r6 is compared against fi at the bottom of .do2 and .do3 but
        ; was never initialised, so the loops ran against whatever value the
        ; caller left in that register.  Load the end-of-buffer address once;
        ; nothing below modifies r6.
        ; NOTE(review): assumes r6 aliases a register in the pushd list above
        ; (ebp) -- confirm against nasm.h.
        mov     r6, [sp(%$n)]
        lea     r6, [r0+r6*4]           ;fn = fz + n
        lea     r1, [r0+8]              ;gi = fi + kx
        jmp     .do

        align   16
.do:
        pmov    mm6, [D_MSB1_0]         ;MSB1_0
        pmov    mm7, [D_1_41421]
.do2:
        ;f
        pmov    mm1, [r0+r3]            ;fi1
        pmov    mm4, [r0+r2]            ;fi3
        pmov    mm0, [r0]               ;fi0
        pmov    mm3, [r0+r3*2]          ;fi2
        pupldq  mm1, mm1
        pupldq  mm4, mm4
        pupldq  mm0, mm0                ;fi0 | fi0
        pupldq  mm3, mm3                ;fi2 | fi2
        pxor    mm1, mm6                ;-fi1 | fi1
        pxor    mm4, mm6                ;-fi3 | fi3
        pfadd   mm0, mm1                ;f1 | f0
        pfadd   mm3, mm4                ;f3 | f2
        pmov    mm4, mm0
        pfadd   mm0, mm3                ;fi1 | fi0
        pfsub   mm4, mm3                ;fi3 | fi2
        pmovd   [r0], mm0               ;fi[0]
        puphdq  mm0, mm0
        pmovd   [r0+r3*2], mm4          ;fi[k2]
        puphdq  mm4, mm4
        pmovd   [r0+r3], mm0            ;fi[k1]
        pmovd   [r0+r2], mm4            ;fi[k3]
        ;g
        pmov    mm1, [r1+r3]            ;gi1
        pmov    mm0, [r1]               ;gi0
        pmov    mm3, [r1+r3*2]          ;gi2
        pmov    mm5, [r1+r2]            ;gi3
        pupldq  mm1, mm1
        pupldq  mm0, mm0                ;gi0 | gi0
        pupldq  mm3, mm5                ;gi3 | gi2
        pxor    mm1, mm6                ;-gi1 | gi1
        pfadd   mm0, mm1                ;g1 | g0
        pfmul   mm3, mm7                ;g3 | g2
        pmov    mm4, mm0
        pfadd   mm0, mm3                ;gi1 | gi0
        pfsub   mm4, mm3                ;gi3 | gi2
        pmovd   [r1], mm0               ;gi[0]
        puphdq  mm0, mm0
        pmovd   [r1+r3*2], mm4          ;gi[k2]
        puphdq  mm4, mm4
        pmovd   [r1+r3], mm0            ;gi[k1]
        pmovd   [r1+r2], mm4            ;gi[k3]
        lea     r0, [r0+r3*4]           ;fi += k4
        lea     r1, [r1+r3*4]           ;gi += k4
        cmp     r0, r6                  ;while (fi < fn)
        jb near .do2

        ; Load this pass's rotation step (t_c, t_s) and reset (c1, s1) = (1, 0).
        mov     r0, [sp(%$k)]
        pmov    mm0, [costab_fft +r0*4]
        pmov    mm1, [sintab_fft +r0*4]
        pupldq  mm0, mm0
        pupldq  mm1, mm1
        pmov    mm6, [D_1_0_D_0_0]      ;c1 | s1
        pmov    mm7, [D_0_0_D_1_0]      ;-s1 | c1
        pmov    [sp(%$t_c)], mm0
        pmov    [sp(%$t_s)], mm1
.for_init:
        mov     r5, 4                   ;i = 1*fsize
        jmp     .for

        align   16
.for:
        ; Advance (c1, s1) by one rotation step, then derive (c2, s2).
        pfmul   mm6, [sp(%$t_c)]        ;c1*t_c | s1*t_c
        pfmul   mm7, [sp(%$t_s)]        ;-s1*t_s | c1*t_s
        pfadd   mm6, mm7                ;c1 | s1
        pmov    mm7, [D_MSB0_1]
        pmov    mm1, mm6
        pxor    mm7, mm6                ;c1 | -s1
        puphdq  mm1, mm1                ;c1
        pmov    mm0, mm7
        pupldq  mm2, mm7
        pfmul   mm0, mm6                ;c1*c1 | -s1*s1
        pfmul   mm1, mm6                ;c1*s1
        puphdq  mm7, mm2                ;-s1 | c1
        pfacc   mm0, mm0                ;c2
        pfadd   mm1, mm1                ;s2 = 2*c1*s1
        pupldq  mm1, mm0                ;c2 | s2
        pupldq  mm0, mm1                ;s2 | c2
        pxor    mm1, [D_MSB1_0]         ;-c2 | s2
        pmov    [sp(%$Ps2_Pc2)], mm0
        pmov    [sp(%$Mc2_Ps2)], mm1
        mov     r0, [sp(%$fz)]
        mov     r1, [sp(%$fz)]
        add     r0, r5                  ;r0 = fi
        add     r1, r3
        sub     r1, r5                  ;r1 = gi
        jmp     .do3

        align   16
.do3:
        pmov    mm2, [r0+r3]
        pmov    mm4, [r1+r3]
        pmov    mm3, [r0+r2]
        pmov    mm5, [r1+r2]
        pupldq  mm2, mm2
        pupldq  mm4, mm4
        pupldq  mm3, mm3
        pupldq  mm5, mm5
        pmov    mm0, [sp(%$Ps2_Pc2)]
        pmov    mm1, [sp(%$Mc2_Ps2)]
        pfmul   mm2, mm0                ;s2 * fi1 | c2 * fi1
        pfmul   mm4, mm1                ;-c2 * gi1 | s2 * gi1
        pfmul   mm3, mm0                ;s2 * fi3 | c2 * fi3
        pfmul   mm5, mm1                ;-c2 * gi3 | s2 * gi3
        pfadd   mm2, mm4                ;b | a
        pfadd   mm3, mm5                ;d | c
        pmov    mm0, [r0]
        pmov    mm4, [r1]
        pmov    mm1, [r0+r3*2]
        pmov    mm5, [r1+r3*2]
        pupldq  mm0, mm4                ;gi0 | fi0
        pupldq  mm1, mm5                ;gi2 | fi2
        pmov    mm4, mm2
        pmov    mm5, mm3
        pfadd   mm2, mm0                ;g0 | f0
        pfadd   mm3, mm1                ;g2 | f2
        pfsub   mm0, mm4                ;g1 | f1
        pfsub   mm1, mm5                ;g3 | f3
        pmov    mm4, mm3
        pmov    mm5, mm1
        pupldq  mm4, mm4                ;f2 | f2
        puphdq  mm5, mm5                ;g3 | g3
        puphdq  mm3, mm3                ;g2 | g2
        pupldq  mm1, mm1                ;f3 | f3
        pfmul   mm4, mm6                ;f2 * c1 | f2 * s1
        pfmul   mm5, mm7                ;g3 * -s1 | g3 * c1
        pfmul   mm3, mm6                ;g2 * c1 | g2 * s1
        pfmul   mm1, mm7                ;f3 * -s1 | f3 * c1
        pfsub   mm4, mm5                ;a | b
        pfadd   mm3, mm1                ;d | c
        pmov    mm5, mm2
        pmov    mm1, mm0
        pupldq  mm2, mm2                ;f0 | f0
        pupldq  mm0, mm0                ;f1 | f1
        puphdq  mm1, mm2                ;f0 | g1
        puphdq  mm5, mm0                ;f1 | g0
        pmov    mm2, mm4
        pmov    mm0, mm3
        pfadd   mm4, mm1                ;fi0 | gi1
        pfadd   mm3, mm5                ;fi1 | gi0
        pfsub   mm1, mm2                ;fi2 | gi3
        pfsub   mm5, mm0                ;fi3 | gi2
        pmovd   [r1+r3], mm4            ;gi[k1]
        puphdq  mm4, mm4
        pmovd   [r1], mm3               ;gi[0]
        puphdq  mm3, mm3
        pmovd   [r1+r2], mm1            ;gi[k3]
        puphdq  mm1, mm1
        pmovd   [r1+r3*2], mm5          ;gi[k2]
        puphdq  mm5, mm5
        pmovd   [r0], mm4               ;fi[0]
        pmovd   [r0+r3], mm3            ;fi[k1]
        pmovd   [r0+r3*2], mm1          ;fi[k2]
        pmovd   [r0+r2], mm5            ;fi[k3]
        lea     r0, [r0+r3*4]           ;fi += k4
        lea     r1, [r1+r3*4]           ;gi += k4
        cmp     r0, r6                  ;while (fi < fn)
        jb near .do3

        add     r5, 4                   ;i++
        cmp     r5, r4                  ;while (i < kx)
        jb near .for

        cmp     r3, [sp(%$n)]           ;r3 numerically equals k4: stop when k4 >= n
        jae     .exit
        add     dword [sp(%$k)], 2      ;k += 2;
        lea     r3, [r3*4]              ;k1 *= 4
        lea     r2, [r2*4]              ;k3 *= 4
        lea     r4, [r4*4]              ;kx *= 4
        mov     r0, [sp(%$fz)]          ;fi
        lea     r1, [r0+r4]             ;gi = fi + kx
        jmp     .do

.exit:
        femms
        popd    ebp, ebx, esi, edi
endproc

;***********************************************************************
%ifdef USE_E3DN
;-----------------------------------------------------------------------
; void fht_E3DN(float *fz, int n);
;
; Same loop structure and register roles as fht_3DN2, but uses the
; pfpnacc / pswapd instructions in place of the unpack/XOR sequences.
;   r0 = fi, r1 = gi, r2 = k3*fsize, r3 = k1*fsize (== k4), r4 = kx*fsize,
;   r5 = i*fsize, r6 = fn = fz + n*fsize.
; Saves/restores ebp, ebx, esi, edi; brackets MMX use with femms.
;-----------------------------------------------------------------------
proc    fht_E3DN
%$fz            arg     4
%$n             arg     4
%$k             local   4
%$Ps2_Pc2       local   8
%$Mc2_Ps2       local   8
%$t_s           local   8
%$t_c           local   8
        alloc

        femms
        pushd   ebp, ebx, esi, edi

fht_E3DN_1st_part:
fht_E3DN_2nd_part:
fht_E3DN_3rd_part:
.do_init:
        mov     r3, 16                  ;k1*fsize = 4*fsize = k4
        mov     r4, 8                   ;kx = k1/2
        mov     r2, 48                  ;k3*fsize
        mov     dword [sp(%$k)], 2      ;k = 2
        mov     r0, [sp(%$fz)]          ;fi
        ; FIX: initialise the loop bound used by "cmp r0, r6" in .do2/.do3;
        ; it was never written before being read.
        ; NOTE(review): assumes r6 aliases a register in the pushd list above
        ; (ebp) -- confirm against nasm.h.
        mov     r6, [sp(%$n)]
        lea     r6, [r0+r6*4]           ;fn = fz + n
        lea     r1, [r0+8]              ;gi = fi + kx
        jmp     .do

        align   16
.do:
        pmov    mm7, [D_1_41421]
.do2:
        pmov    mm0, [r0]               ;fi0
        pupldq  mm0, [r0+r3]            ;fi1 | fi0
        pmov    mm1, [r0+r3*2]          ;fi2
        pupldq  mm1, [r0+r2]            ;fi3 | fi2
        pmov    mm3, [r1]               ;gi0
        pupldq  mm3, [r1+r3]            ;gi1 | gi0
        pmov    mm4, [r1+r2]            ;gi3
        pupldq  mm4, [r1+r3*2]          ;gi2 | gi3
        pfpnacc mm0, mm0                ;f0 | f1
        pfpnacc mm1, mm1                ;f2 | f3
        pfpnacc mm3, mm3                ;g0 | g1
        pfmul   mm4, mm7                ;g2 | g3
        pmov    mm2, mm0
        pfadd   mm0, mm1                ;fi0 | fi1
        pfsub   mm2, mm1                ;fi2 | fi3
        pmov    mm5, mm3
        pfadd   mm3, mm4                ;gi0 | gi1
        pfsub   mm5, mm4                ;gi2 | gi3
        pmovd   [r0+r3], mm0            ;fi[k1]
        puphdq  mm0, mm0
        pmovd   [r0+r2], mm2            ;fi[k3]
        puphdq  mm2, mm2
        pmovd   [r1+r3], mm3            ;gi[k1]
        puphdq  mm3, mm3
        pmovd   [r1+r2], mm5            ;gi[k3]
        puphdq  mm5, mm5
        pmovd   [r0], mm0               ;fi[0]
        pmovd   [r0+r3*2], mm2          ;fi[k2]
        pmovd   [r1], mm3               ;gi[0]
        pmovd   [r1+r3*2], mm5          ;gi[k2]
        lea     r0, [r0+r3*4]           ;fi += k4
        lea     r1, [r1+r3*4]           ;gi += k4
        cmp     r0, r6                  ;while (fi < fn)
        jb near .do2

        ; Load this pass's rotation step (t_c, t_s) and reset (c1, s1) = (1, 0).
        mov     r0, [sp(%$k)]
        pmov    mm0, [costab_fft +r0*4]
        pmov    mm1, [sintab_fft +r0*4]
        pupldq  mm0, mm0
        pupldq  mm1, mm1
        pmov    mm6, [D_1_0_D_0_0]      ;c1 | s1
        pmov    mm7, [D_0_0_D_1_0]      ;-s1 | c1
        pmov    [sp(%$t_c)], mm0
        pmov    [sp(%$t_s)], mm1
.for_init:
        mov     r5, 4                   ;i = 1*fsize
        jmp     .for

        align   16
.for:
        ; Advance (c1, s1) by one rotation step, then derive (c2, s2).
        pfmul   mm6, [sp(%$t_c)]        ;c1*t_c | s1*t_c
        pfmul   mm7, [sp(%$t_s)]        ;-s1*t_s | c1*t_s
        pfadd   mm6, mm7                ;c1 | s1
        pmov    mm7, [D_MSB0_1]
        pswapd  mm1, mm6                ;s1 | c1
        pswapd  mm0, mm6
        pxor    mm7, mm6                ;c1 | -s1
        pfmul   mm1, mm6                ;c1*s1 | c1*s1
        pfmul   mm0, mm0                ;s1*s1 | c1*c1
        pswapd  mm7, mm7                ;-s1 | c1
        pfpnacc mm0, mm1                ;s2 = 2*c1*s1 | c2 = c1*c1-s1*s1
        pswapd  mm1, mm0                ;c2 | s2
        pxor    mm1, [D_MSB1_0]         ;-c2 | s2
        pmov    [sp(%$Ps2_Pc2)], mm0
        pmov    [sp(%$Mc2_Ps2)], mm1
        mov     r0, [sp(%$fz)]
        mov     r1, [sp(%$fz)]
        add     r0, r5                  ;r0 = fi
        add     r1, r3
        sub     r1, r5                  ;r1 = gi
        jmp     .do3

        align   16
.do3:
        pmov    mm0, [r0+r2]
        pmov    mm2, [r1+r2]
        pmov    mm1, [r0+r3]
        pmov    mm3, [r1+r3]
        pupldq  mm0, mm0
        pupldq  mm2, mm2
        pupldq  mm1, mm1
        pupldq  mm3, mm3
        pmov    mm4, [sp(%$Ps2_Pc2)]
        pmov    mm5, [sp(%$Mc2_Ps2)]
        pfmul   mm0, mm4                ;s2 * fi3 | c2 * fi3
        pfmul   mm2, mm5                ;-c2 * gi3 | s2 * gi3
        pfmul   mm1, mm4                ;s2 * fi1 | c2 * fi1
        pfmul   mm3, mm5                ;-c2 * gi1 | s2 * gi1
        pfadd   mm0, mm2                ;d | c
        pfadd   mm1, mm3                ;b | a
        pmov    mm2, [r0+r3*2]          ;fi2
        pupldq  mm3, [r1+r3*2]          ;gi2 | -
        pmov    mm4, [r0]               ;fi0
        pupldq  mm5, [r1]               ;gi0 | -
        pupldq  mm2, mm0                ;c | fi2
        puphdq  mm3, mm0                ;d | gi2
        pupldq  mm4, mm1                ;a | fi0
        puphdq  mm5, mm1                ;b | gi0
        pfpnacc mm2, mm2                ;f2 | f3
        pfpnacc mm3, mm3                ;g2 | g3
        pfpnacc mm4, mm4                ;f0 | f1
        pfpnacc mm5, mm5                ;g0 | g1
        pmov    mm0, mm2
        pmov    mm1, mm3
        pupldq  mm2, mm2                ;f3 | f3
        pupldq  mm3, mm3                ;g3 | g3
        puphdq  mm0, mm0                ;f2 | f2
        puphdq  mm1, mm1                ;g2 | g2
        pswapd  mm4, mm4                ;f1 | f0
        pswapd  mm5, mm5                ;g1 | g0
        pfmul   mm0, mm7                ;f2 *-s1 | f2 * c1
        pfmul   mm3, mm6                ;g3 * c1 | g3 * s1
        pfmul   mm1, mm6                ;g2 * c1 | g2 * s1
        pfmul   mm2, mm7                ;f3 *-s1 | f3 * c1
        pfadd   mm0, mm3                ;-b | a
        pfadd   mm1, mm2                ; d | c
        pmov    mm2, mm5
        pmov    mm3, mm4
        pupldq  mm4, mm0                ; a | f0
        pupldq  mm5, mm1                ; c | g0
        puphdq  mm2, mm0                ;-b | g1
        puphdq  mm3, mm1                ; d | f1
        pfpnacc mm4, mm4                ;fi0 | fi2
        pfpnacc mm5, mm5                ;gi0 | gi2
        pfpnacc mm2, mm2                ;gi3 | gi1
        pfpnacc mm3, mm3                ;fi1 | fi3
        pmovd   [r0+r3*2], mm4          ;fi[k2]
        puphdq  mm4, mm4
        pmovd   [r1+r3*2], mm5          ;gi[k2]
        puphdq  mm5, mm5
        pmovd   [r1+r3], mm2            ;gi[k1]
        puphdq  mm2, mm2
        pmovd   [r0+r2], mm3            ;fi[k3]
        puphdq  mm3, mm3
        pmovd   [r0], mm4               ;fi[0]
        pmovd   [r1], mm5               ;gi[0]
        pmovd   [r1+r2], mm2            ;gi[k3]
        pmovd   [r0+r3], mm3            ;fi[k1]
        lea     r0, [r0+r3*4]           ;fi += k4
        lea     r1, [r1+r3*4]           ;gi += k4
        cmp     r0, r6                  ;while (fi < fn)
        jb near .do3

        add     r5, 4                   ;i++
        cmp     r5, r4                  ;while (i < kx)
        jb near .for

        cmp     r3, [sp(%$n)]           ;r3 numerically equals k4: stop when k4 >= n
        jae     .exit
        add     dword [sp(%$k)], 2      ;k += 2;
        lea     r3, [r3*4]              ;k1 *= 4
        lea     r2, [r2*4]              ;k3 *= 4
        lea     r4, [r4*4]              ;kx *= 4
        mov     r0, [sp(%$fz)]          ;fi
        lea     r1, [r0+r4]             ;gi = fi + kx
        jmp     .do

.exit:
        femms
        popd    ebp, ebx, esi, edi
endproc
%endif

;***********************************************************************
;-----------------------------------------------------------------------
; void fft_side_3DN(float in[2][1024], int s, float *ret);
;
; s = MSFREQ is assumed to be a multiple of 4 (it was 20 when this was
; written).  The loop advances 4 floats per iteration and exits on
; r2 >= 512*fsize, so the tail below reads in[.][512] only when s is a
; multiple of 4.
;
; Accumulates the squared differences (in[0][j] - in[1][j])^2 over two
; index ranges walked in opposite directions: upward from s (via r2) and
; downward from 1024-s (via r3).  The two accumulator lanes are summed
; with pfacc and scaled by 0.25; the squared difference at the final r2
; position, scaled by 0.5, is added and the low lane is stored to *ret.
;
; Register roles: r0 = &in[0][0], r1 = &in[1][0], r2 = rising byte offset,
; r3 = falling byte offset, r4 = 512*fsize (loop bound), mm7 = accumulator.
; Saves/restores ebx, esi; brackets MMX use with femms.
;-----------------------------------------------------------------------
proc    fft_side_3DN
%$in            arg     4
%$s             arg     4
%$pret          arg     4

        femms
        pushd   ebx, esi

.for_init:
        mov     r0, [sp(%$in)]          ;r0 = &in[0][0]
        lea     r1, [r0+fsizen(1024)]   ;r1 = &in[1][0]
        mov     r2, [sp(%$s)]
        mov     r3, fsizen(1023)
        shl     r2, 2                   ;r2 = s * fsize
        sub     r3, r2                  ;r3 = (1023-s) * fsize
        mov     r4, fsizen(512)         ;r4 = 512 * fsize
        pxor    mm7, mm7                ;accumulator = 0 | 0
        jmp     .for

        align   16
.for:
        pmov    mm0, [r0+r3]
        pmov    mm1, [r1+r3]
        pmov    mm2, [r0+r2]
        pfsub   mm0, mm1
        pmov    mm3, [r1+r2]
        pfsub   mm2, mm3
        pmov    mm4, [r0+r3-fsizen(2)]
        pfmul   mm0, mm0
        pmov    mm5, [r1+r3-fsizen(2)]
        pfmul   mm2, mm2
        pmov    mm1, [r0+r2+fsizen(2)]
        pupldq  mm6, mm0                ;with the puphdq below: swap lanes of mm0
        pmov    mm3, [r1+r2+fsizen(2)]
        pfadd   mm7, mm2
        pfsub   mm4, mm5
        puphdq  mm0, mm6
        pfsub   mm1, mm3
        pfadd   mm7, mm0
        pfmul   mm4, mm4
        add     r2, fsizen(4)
        pfmul   mm1, mm1
        sub     r3, fsizen(4)
        pupldq  mm6, mm4                ;with the puphdq below: swap lanes of mm4
        pfadd   mm7, mm1
        cmp     r2, r4                  ;flags consumed by jb; the MMX ops between leave them intact
        puphdq  mm4, mm6
        pfadd   mm7, mm4
        jb      .for

        pmov    mm0, [r0+r2]
        pmov    mm1, [r1+r2]
        pmov    mm2, [D_0_25]
        pfsub   mm0, mm1
        pmov    mm3, [D_0_5]
        pfacc   mm7, mm7                ;sum the two accumulator lanes
        pfmul   mm0, mm0
        pfmul   mm7, mm2                ;* 0.25
        pfmul   mm0, mm3                ;* 0.5
        mov     r0, [sp(%$pret)]
        pfadd   mm0, mm7
        pmovd   [r0], mm0               ;*ret = low lane
.exit:
        femms
        popd    ebx, esi
endproc

;***********************************************************************
%ifdef USE_E3DN
;-----------------------------------------------------------------------
; void fft_side_E3DN(float in[2][1024], int s, float *ret);
;
; s = MSFREQ is assumed to be a multiple of 4 (it was 20 when this was
; written).
;
; Same computation and register roles as fft_side_3DN, using memory-operand
; pfsub/pfmul and pswapd.  The four partial products are added to the
; accumulator in a different order than in fft_side_3DN.
; Saves/restores ebx, esi; brackets MMX use with femms.
;-----------------------------------------------------------------------
proc    fft_side_E3DN
%$in            arg     4
%$s             arg     4
%$pret          arg     4

        femms
        pushd   ebx, esi

.for_init:
        mov     r0, [sp(%$in)]          ;r0 = &in[0][0]
        lea     r1, [r0+fsizen(1024)]   ;r1 = &in[1][0]
        mov     r2, [sp(%$s)]
        mov     r3, fsizen(1023)
        shl     r2, 2                   ;r2 = s * fsize
        sub     r3, r2                  ;r3 = (1023-s) * fsize
        mov     r4, fsizen(512)         ;r4 = 512 * fsize
        pxor    mm7, mm7                ;accumulator = 0 | 0
        jmp     .for

        align   16
.for:
        pmov    mm0, [r0+r3]
        pfsub   mm0, [r1+r3]
        pmov    mm4, [r0+r3-fsizen(2)]
        pfsub   mm4, [r1+r3-fsizen(2)]
        pmov    mm2, [r0+r2]
        pfsub   mm2, [r1+r2]
        pmov    mm1, [r0+r2+fsizen(2)]
        pfsub   mm1, [r1+r2+fsizen(2)]
        pfmul   mm0, mm0
        pfmul   mm4, mm4
        pfmul   mm2, mm2
        pfmul   mm1, mm1
        pswapd  mm0, mm0
        pswapd  mm4, mm4
        pfadd   mm7, mm2
        add     r2, fsizen(4)
        pfadd   mm7, mm1
        sub     r3, fsizen(4)
        pfadd   mm7, mm0
        cmp     r2, r4                  ;flags consumed by jb; pfadd leaves them intact
        pfadd   mm7, mm4
        jb      .for

        pmov    mm0, [r0+r2]
        pfsub   mm0, [r1+r2]
        pfacc   mm7, mm7                ;sum the two accumulator lanes
        pfmul   mm0, mm0
        pfmul   mm7, [D_0_25]
        pfmul   mm0, [D_0_5]
        mov     r0, [sp(%$pret)]
        pfadd   mm0, mm7
        pmovd   [r0], mm0               ;*ret = low lane
.exit:
        femms
        popd    ebx, esi
endproc
%endif
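; (code-viewer residue, not part of the source)
; Keyboard shortcuts:
;   Copy code            Ctrl + C
;   Search code          Ctrl + F
;   Full-screen mode     F11
;   Toggle theme         Ctrl + Shift + D
;   Show shortcuts       ?
;   Increase font size   Ctrl + =
;   Decrease font size   Ctrl + -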