; mdctasm.asm
; [code-viewer page header, not part of the source: "Font size:"]
;/*
; * mdct.c ASM version for the XingMPG Decoder
; * Include hybrid and FreqInvert functions
; *
; * Copyright (c) 1999, Jean-Michel HERVE
; *
; * This is hardcore coding, really! But should work pretty fine. If you find
; * a bug mmmmh... tell me where :)
; *
; * Code : TuO / StG
; * Date : 05/04/99
; *
; * This program is free software; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; * $Id: mdctasm.asm,v 1.3 1999/04/22 08:24:01 mhw Exp $
; */
BITS 32

;/* ------------------------------------------------------------------ */
;/* Entry points exported by this file                                  */
;/* ------------------------------------------------------------------ */
        GLOBAL  _imdct18_asm
        GLOBAL  _imdct6_3_asm
        GLOBAL  _hybrid_asm
        GLOBAL  _FreqInvert_asm

;/* ------------------------------------------------------------------ */
;/* Symbols defined outside this file                                   */
;/* ------------------------------------------------------------------ */
        EXTERN  _mdct18w                ;/* w[]  table read by _imdct18_asm   */
        EXTERN  _mdct18w2               ;/* w2[] table read by _imdct18_asm   */
        EXTERN  _mdct6_3v
        EXTERN  _mdct6_3v2
        EXTERN  _coef                   ;/* coef[][] rows, 16 bytes per row   */
        EXTERN  _coef87
        EXTERN  _win
        EXTERN  _band_limit_nsb

;/* ------------------------------------------------------------------ */
;/* Assemble-time constants                                             */
;/* ------------------------------------------------------------------ */
WINSIZE         equ     144

;/* ------------------------------------------------------------------ */
;/* Private, writable data                                              */
;/* ------------------------------------------------------------------ */
SECTION .data USE32

;/* Temp tables: 24 zero-initialised dwords each. _imdct18_asm loads   */
;/* their addresses into eax/ebx and uses them as its a[] and b[]      */
;/* scratch arrays.                                                     */
tab_a           times 24 dd 0
tab_b           times 24 dd 0

;/* Single-precision 0.5, used as an fmul memory operand.              */
My_FPU_Half     dd      0.5

;/* One zero-initialised dword of scratch storage.                     */
SaveReturn      dd      0
SECTION .text USE32
;/*
; * imdct18 ASM Version
; */
_imdct18_asm: ;// PROC f[]
;// ebp
push ebp
lea ebp,[esp+8]
pushad
mov ebp,[ebp] ;/* Save arg */
mov edi,_mdct18w
mov esi,_mdct18w2
mov eax,tab_a
mov ebx,tab_b
mov ecx,_coef
;/* Unroll the loop because it is a known-value count */
%assign p 0 ;/* Use compiler var as counter */
%rep 4
;/* First part - compute ap and bp */
fld dword [edi+p*4] ;/* w[p] */
fmul dword [ebp+p*4] ;/* g1 */
fld dword [edi+(17-p)*4] ;/* w[17-p] g1 */
fmul dword [ebp+(17-p)*4] ;/* g2 g1 */
fld st0 ;/* g2 g2 g1 */
fadd st2 ;/* ap g2 g1 */
fxch st1 ;/* g2 ap g1 */
fsubp st2,st0 ;/* ap g1-g2 */
fxch st1 ;/* g1-g2 ap */
fmul dword [esi+p*4] ;/* bp ap */
;/* Second part - compute a8p and b8p */
fld dword [edi+(8-p)*4] ;/* w[8-p] bp ap */
fmul dword [ebp+(8-p)*4] ;/* g1 bp ap */
fld dword [edi+(9+p)*4] ;/* w[9+p] g1 bp ap */
fmul dword [ebp+(9+p)*4] ;/* g2 g1 bp ap */
fld st0 ;/* g2 g2 g1 bp ap */
fadd st2 ;/* a8p g2 g1 bp ap */
fxch st1 ;/* g2 a8p g1 bp ap */
fsubp st2,st0 ;/* a8p g1-g2 bp ap */
fxch st1 ;/* g1-g2 a8p bp ap */
fmul dword [esi+(8-p)*4] ;/* b8p a8p bp ap */
;/* Final part - finalize computing and store values */
fld st3 ;/* ap b8p a8p bp ap */
fadd st2 ;/* ap+a8p b8p a8p bp ap */
fstp dword [eax+p*4] ;/* b8p a8p bp ap */
fxch st1 ;/* a8p b8p bp ap */
fsubp st3,st0 ;/* b8p bp ap-a8p */
fxch st2 ;/* ap-a8p bp b8p */
fstp dword [eax+(p+5)*4] ;/* bp b8p */
fxch st1 ;/* b8p bp */
fld st0 ;/* b8p b8p bp */
fadd st2 ;/* b8p+bp b8p bp */
fstp dword [ebx+p*4] ;/* b8p bp */
fsubp st1,st0 ;/* bp-b8p */
fstp dword [ebx+(p+5)*4] ;/* - */
%assign p p+1
%endrep
%assign p 4
;/* Last - finalize array */
fld dword [edi+p*4] ;/* w[p] */
fmul dword [ebp+p*4] ;/* g1 */
fld dword [edi+(17-p)*4] ;/* w[17-p] g1 */
fmul dword [ebp+(17-p)*4] ;/* g2 g1 */
fld st0 ;/* g2 g2 g1 */
fadd st2 ;/* ap g2 g1 */
fstp dword [eax+p*4] ;/* g2 g1 */
fsubp st1,st0 ;/* g1-g2 */
fmul dword [esi+p*4] ;/* bp */
fstp dword [ebx+p*4] ;/* - */
;/* Now the huge and boring part */
;// f[0] = 0.5f * (a[0] + a[1] + a[2] + a[3] + a[4]);
;// TO DO : avoid reload of a[4] and b[4]
fld dword [eax+0*4] ;/* a[0] */
fadd dword [eax+1*4] ;/* a[0]+a[1] */
fadd dword [eax+2*4] ;/* a[0]+a[1]+a[2] */
fadd dword [eax+3*4] ;/* a[0]+a[1]+a[2]+a[3] */
fadd dword [eax+4*4] ;/* a[0]+a[1]+a[2]+a[3]+a[4] */
fmul dword [My_FPU_Half] ;/* 0.5*(a[0]+a[1]+a[2]+a[3]+a[4]) */
fst dword [ebp+0*4] ;/* f[0] */
;// f[1] = 0.5f * (b[0] + b[1] + b[2] + b[3] + b[4]);
fld dword [ebx+0*4] ;/* b[0] f[0] */
fadd dword [ebx+1*4] ;/* b[0]+b[1] f[0] */
fadd dword [ebx+2*4] ;/* b[0]+b[1]+b[2] f[0] */
fadd dword [ebx+3*4] ;/* b[0]+b[1]+b[2]+b[3] f[0] */
fadd dword [ebx+4*4] ;/* b[0]+b[1]+b[2]+b[3]+b[4] f[0] */
fmul dword [My_FPU_Half] ;/* f[1]' f[0] */
;// f[2] = coef[1][0]*a[5]+coef[1][1]*a[6]+coef[1][2]*a[7]+coef[1][3]*a[8];
fld dword [ecx+1*16+0*4] ;/* coef[1][0] f[1]' f[0] */
fld dword [ecx+1*16+1*4] ;/* coef[1][1] coef[1][0] f[1]' f[0] */
fld dword [ecx+1*16+2*4] ;/* coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
fld dword [ecx+1*16+3*4] ;/* coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
fld st3 ;/* coef[1][0] coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
fmul dword [eax+5*4] ;/* v0 coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
fld st3 ;/* coef[1][1] v0 coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
fmul dword [eax+6*4] ;/* v1 v0 coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
faddp st1,st0 ;/* v1+v0 coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
fld st2 ;/* coef[1][2] v1+v0 coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
fmul dword [eax+7*4] ;/* v2 v1+v0 coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
faddp st1,st0 ;/* v2+v1+v0 coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
fld st1 ;/* coef[1][3] v2+v1+v0 coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
fmul dword [eax+8*4] ;/* v3 v2+v1+v0 coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
faddp st1,st0 ;/* f[2]' coef[1][3] coef[1][2] coef[1][1] coef[1][0] f[1]' f[0] */
;// f[3] = coef[1][0]*b[5]+coef[1][1]*b[6]+coef[1][2]*b[7]+coef[1][3]*b[8]-f[1];
fxch st4 ;/* coef[1][0] coef[1][3] coef[1][2] coef[1][1] f[2]' f[1]' f[0] */
fmul dword [ebx+5*4] ;/* v0 coef[1][3] coef[1][2] coef[1][1] f[2]' f[1]' f[0] */
fxch st3 ;/* coef[1][1] coef[1][3] coef[1][2] v0 f[2]' f[1]' f[0] */
fmul dword [ebx+6*4] ;/* v1 coef[1][3] coef[1][2] v0 f[2]' f[1]' f[0] */
faddp st3,st0 ;/* coef[1][3] coef[1][2] v0+v1 f[2]' f[1]' f[0] */
fxch st1 ;/* coef[1][2] coef[1][3] v0+v1 f[2]' f[1]' f[0] */
fmul dword [ebx+7*4] ;/* v2 coef[1][3] v0+v1 f[2]' f[1]' f[0] */
faddp st2,st0 ;/* coef[1][3] v0+v1+v2 f[2]' f[1]' f[0] */
fmul dword [ebx+8*4] ;/* v3 v0+v1+v2 f[2]' f[1]' f[0] */
faddp st1,st0 ;/* v0+v1+v2+v3 f[2]' f[1]' f[0] */
fsub st0,st2 ;/* f[3]' f[2]' f[1]' f[0] */
;// f[1] = f[1] - f[0];
fxch st3 ;/* f[0] f[2]' f[1]' f[3]' */
fsubp st2,st0 ;/* f[2]' f[1] f[3]' */
fxch st1 ;/* f[1] f[2]' f[3]' */
fst dword [ebp+1*4] ;/* f[1] f[2]' f[3]' */
;// f[2] = f[2] - f[1];
fsubp st1,st0 ;/* f[2] f[3]' */
fst dword [ebp+2*4] ;/* f[2] f[3]' */
fxch st1 ;/* f[3]' f[2] */
;// f[4]=coef[2][0]*a[0]+coef[2][1]*a[1]+coef[2][2]*a[2]+coef[2][3]*a[3]-a[4];
fld dword [ecx+2*16+0*4] ;/* c[2,0] f[3]' f[2] */
fld dword [ecx+2*16+1*4] ;/* c[2,1] c[2,0] f[3]' f[2] */
fld dword [ecx+2*16+2*4] ;/* c[2,2] c[2,1] c[2,0] f[3]' f[2] */
fld dword [ecx+2*16+3*4] ;/* c[2,3] c[2,2] c[2,1] c[2,0] f[3]' f[2] */
fld st3 ;/* c[2,0] c[2,3] c[2,2] c[2,1] c[2,0] f[3]' f[2] */
fmul dword [eax+0*4] ;/* v0 c[2,3] c[2,2] c[2,1] c[2,0] f[3]' f[2] */
fld st3 ;/* c[2,1] v0 c[2,3] c[2,2] c[2,1] c[2,0] f[3]' f[2] */
fmul dword [eax+1*4] ;/* v1 v0 c[2,3] c[2,2] c[2,1] c[2,0] f[3]' f[2] */
faddp st1,st0 ;/* v1+v0 c[2,3] c[2,2] c[2,1] c[2,0] f[3]' f[2] */
fld st2 ;/* c[2,2] v1+v0 c[2,3] c[2,2] c[2,1] c[2,0] f[3]' f[2] */
fmul dword [eax+2*4] ;/* v2 v1+v0 c[2,3] c[2,2] c[2,1] c[2,0] f[3]' f[2] */
faddp st1,st0 ;/* v0v1v2 c[2,3] c[2,2] c[2,1] c[2,0] f[3]' f[2] */
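; [code-viewer page footer, not part of the source -- keyboard shortcuts:
;  copy code: Ctrl + C; search code: Ctrl + F; full-screen mode: F11;
;  toggle theme: Ctrl + Shift + D; show shortcuts: ?;
;  increase font size: Ctrl + =; decrease font size: Ctrl + -]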