⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fftsse.nas

📁 lame 3.97源码.最好的mp3解码和压缩软件
💻 NAS
📖 第 1 页 / 共 2 页
字号:
; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA; GOGO-no-coda;	Copyright (C) 1999 shigeo;	special thanks to Keiichi SAKAI%include "nasm.h"	globaldef fht_SSE	globaldef fft_side_SSE	externdef costab_fft	externdef sintab_fft	segment_data	align 16Q_MMPP	dd	0x0,0x0,0x80000000,0x80000000Q_MPMP	dd	0x0,0x80000000,0x0,0x80000000Q_002	dd	0.02236068, 0.02236068, 0.02236068, 0.02236068D_SQRT2	dd	1.414213562,1.414213562S_025	dd	0.25S_05	DD	0.5S_00005	DD	0.0005	segment_code;------------------------------------------------------------------------;	by K. SAKAI;	99/08/18	PIII 23k[clk];	99/08/19	炭吾界进掐れ垂え PIII 22k[clk];	99/08/20	bit reversal を奠羔稿から败竣した PIII 17k[clk];	99/08/23	办婶 unroll PIII 14k[clk];	99/11/12	clean up;;void fht_SSE(float *fz, int n);	align 16fht_SSE:	push	ebx	push	esi	push	edi	push	ebp%assign _P 4*4	;2つ誊のル〖プ	mov	eax,[esp+_P+4]	;eax=fz	mov	ebp,[esp+_P+8]	;=n	shl	ebp,2	add	ebp,eax		; fn  = fz + n, この簇眶姜位まで稍恃	xor	ecx,ecx		; ecx=k=0	xor	eax,eax	mov	al,4		; =k1=1*(sizeof float)	// 4, 16, 64, 256,...	xor	edx,edx	mov	dl,12		; =k3=3*k1	jmp	short .lp2	align	16.lp2:				; do{	add	cl,2		; k  += 2;	shl	eax,2	shl	edx,2	mov	esi,[esp+_P+4]	;esi=fi=fz	mov	edi,eax	shr	edi,1	add	edi,esi		; edi=gi=fi+ki/2; たかだか2事误しか袋略できない婶尸はFPUのほうが庐い。	movss	xmm7,[D_SQRT2]	jmp	short .lp20	align	16.lp20:				; do{;                       f0     = fi[0 ] + fi[k1];;                       f2     = fi[k2] + fi[k3];;                       f1     = fi[0 ] - fi[k1];;                       f3     = fi[k2] - fi[k3];;                       fi[0 ] = f0     + f2;;                       fi[k1] = f1     + f3;;                       fi[k2] = f0     - f2;;                       fi[k3] = f1     - f3;	fld	dword [esi]	fadd	dword [esi+eax]	fld	dword [esi+eax*2]	fadd	dword [esi+edx]	fld	dword [esi]	fsub	dword [esi+eax]	fld	dword [esi+eax*2]	fsub	dword [esi+edx]	fld	st1	fadd	st0,st1	fstp	dword [esi+eax]	fsubp	st1,st0	fstp	dword [esi+edx]	fld	st1	fadd	st0,st1	fstp	dword [esi]	fsubp	st1,st0	fstp	dword [esi+eax*2]	lea	esi,[esi + eax*4]	; = fi += (k1 * 4);;	add	esi,eax;	add	esi,edx;                       g0     = gi[0 ] + gi[k1];;                       g2     = SQRT2  * gi[k2];;                       g1     = gi[0 ] - gi[k1];;                       g3     = SQRT2  * gi[k3];;                       gi[0 ] = g0     + g2;;                       gi[k2] = g0     - g2;;                       gi[k1] = g1     + g3;;                       gi[k3] = g1     - g3;	fld	dword [edi]	fadd	dword [edi+eax]	fld	dword [D_SQRT2]	fmul	dword [edi+eax*2]	fld	dword [edi]	fsub	dword [edi+eax]	fld	dword [D_SQRT2]	fmul	dword [edi+edx]	fld	st1	fadd	st0,st1	fstp	dword [edi+eax]	fsubp	st1,st0	fstp	dword [edi+edx]	fld	st1	fadd	st0,st1	fstp	dword [edi]	fsubp	st1,st0	fstp	dword [edi+eax*2]	lea	edi,[edi + eax*4]	; = gi += (k1 * 4);	cmp	esi,ebp	jl	near .lp20		; while (fi<fn);;               i = 1; //for (i=1;i<kx;i++){;                       c1 = 1.0*t_c - 0.0*t_s;;                       s1 = 0.0*t_c + 1.0*t_s;	movss	xmm6,[costab_fft + ecx*4]	movss	xmm1,[sintab_fft + ecx*4]	shufps	xmm6,xmm1,0x00	; = {s1, s1, c1, c1}	shufps	xmm6,xmm6,0x28	; = {+c1, +s1, +s1, +c1};                       c2 = c1*c1 - s1*s1;;                       s2 = c1*s1 + s1*c1;	movaps	xmm4,xmm6	movaps	xmm7,xmm6	unpcklps	xmm4,xmm4	; = {s1, s1, c1, c1}	shufps	xmm7,xmm7,0x41	mulps	xmm4,xmm6	; = {s1*c1, s1*s1, c1*s1, c1*c1}	xorps	xmm7,[Q_MMPP]	; = {-s1, -c1, +c1, +s1}	movhlps	xmm3,xmm4	xorps	xmm3,[Q_MPMP]	subps	xmm4,xmm3	; = {--, --, s2, c2}	movlhps	xmm4,xmm4	; = {+s2, +c2, +s2, +c2}	movaps	xmm5,xmm4	shufps	xmm5,xmm5,0x11	xorps	xmm5,[Q_MPMP]	; = {-c2, +s2, -c2, +s2}	mov	esi,[esp+_P+4]	; = fz	lea	edi,[esi + eax - 4]	; edi = gi = fz +k1-i	add	esi,4		; esi = fi = fz + i	jmp	short .lp21	align	16.lp21:				; do{;                               a       = c2*fi[k1] + s2*gi[k1];;                               b       = s2*fi[k1] - c2*gi[k1];;                               c       = c2*fi[k3] + s2*gi[k3];;                               d       = s2*fi[k3] - c2*gi[k3];;                               f0      = fi[0 ]        + a;;                               g0      = gi[0 ]        + b;;                               f2      = fi[k1 * 2]    + c;;                               g2      = gi[k1 * 2]    + d;;                               f1      = fi[0 ]        - a;;                               g1      = gi[0 ]        - b;;                               f3      = fi[k1 * 2]    - c;;                               g3      = gi[k1 * 2]    - d;	movss	xmm0,[esi + eax]	; = fi[k1]	movss	xmm2,[esi + edx]	; = fi[k3]	shufps	xmm0,xmm2,0x00	; = {fi[k3], fi[k3], fi[k1], fi[k1]}	movss	xmm1,[edi + eax]	; = fi[k1]	movss	xmm3,[edi + edx]	; = fi[k3]	shufps	xmm1,xmm3,0x00	; = {gi[k3], gi[k3], gi[k1], gi[k1]}	movss	xmm2,[esi]		; = fi[0]	mulps	xmm0,xmm4		; *= {+s2, +c2, +s2, +c2}	movss	xmm3,[esi + eax*2]	; = fi[k2]	unpcklps	xmm2,xmm3	; = {--, --, fi[k2], fi[0]}	mulps	xmm1,xmm5		; *= {-c2, +s2, -c2, +s2}	movss	xmm3,[edi + eax*2]	; = gi[k2]	addps	xmm0,xmm1		; = {d, c, b, a}	movss	xmm1,[edi]		; = gi[0]	unpcklps	xmm1,xmm3	; = {--,  --, gi[k2], gi[0]}	unpcklps	xmm2,xmm1	; = {gi[k2], fi[k2], gi[0], fi[0]}	movaps	xmm1,xmm2	addps	xmm1,xmm0	; = {g2, f2, g0, f0}	subps	xmm2,xmm0	; = {g3, f3, g1, f1};                               a       = c1*f2     + s1*g3;;                               c       = s1*g2     + c1*f3;;                               b       = s1*f2     - c1*g3;;                               d       = c1*g2     - s1*f3;;                               fi[0 ]  = f0        + a;;                               gi[0 ]  = g0        + c;;                               gi[k1]  = g1        + b;;                               fi[k1]  = f1        + d;;                               fi[k1 * 2]  = f0    - a;;                               gi[k1 * 2]  = g0    - c;;                               gi[k3]      = g1    - b;;                               fi[k3]      = f1    - d;	movaps	xmm3,xmm1	movhlps	xmm1,xmm1	; = {g2, f2, g2, f2}	shufps	xmm3,xmm2,0x14	; = {f1, g1, g0, f0}	mulps	xmm1,xmm6	; *= {+c1, +s1, +s1, +c1}	shufps	xmm2,xmm2,0xBB	; = {f3, g3, f3, g3}	mulps	xmm2,xmm7	; *= {-s1, -c1, +c1, +s1}	addps	xmm1,xmm2	; = {d, b, c, a}	movaps	xmm2,xmm3	addps	xmm3,xmm1	; = {fi[k1], gi[k1], gi[0], fi[0]}	subps	xmm2,xmm1	; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]}	movhlps	xmm0,xmm3	movss	[esi],xmm3	shufps	xmm3,xmm3,0x55	movss	[edi+eax],xmm0	shufps	xmm0,xmm0,0x55	movss	[edi],xmm3	movss	[esi+eax],xmm0	movhlps	xmm0,xmm2	movss	[esi+eax*2],xmm2	shufps	xmm2,xmm2,0x55	movss	[edi+edx],xmm0	shufps	xmm0,xmm0,0x55	movss	[edi+eax*2],xmm2	lea	edi,[edi + eax*4] ; gi += (k1 * 4);	movss	[esi+edx],xmm0	lea	esi,[esi + eax*4] ; fi += (k1 * 4);	cmp	esi,ebp	jl	near .lp21		; while (fi<fn);; unroll涟のdo loopは43+4炭吾; 呵柒件ではないforル〖プをunrollingした; kx=   2,   8,  32,  128; k4=  16,  64, 256, 1024;       0, 6/2,30/2,126/2; at here;	xmm6 = {--, --, s1, c1};               c3 = c1; s3 = s1;	xor	ebx,ebx	mov	bl,4*4		; = i = 4	cmp	ebx,eax		; i < k1	jnl	near .F22	shufps	xmm6,xmm6,0x14	; = {c1, s1, s1, c1}	jmp	short .F220	align	16;               for (i=4;i<k1;i+=4){ // for (i=2;i<k1/2;i+=2){.lp22:	shufps	xmm6,xmm6,0x69	; xmm6 = {c3, s3, s3, c3}.F220:; at here, xmm6 is {c3, s3, s3, c3};                       c1 = c3*t_c - s3*t_s;;                       s1 = c3*t_s + s3*t_c;	movss	xmm0,[costab_fft + ecx*4]	movss	xmm1,[sintab_fft + ecx*4]	shufps	xmm0,xmm1,0x00	; = {t_s, t_s, t_c, t_c}	mulps	xmm6,xmm0	movhlps	xmm4,xmm6

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -