📄 cdctasm.asm
字号:
;/*
; * DCT functions ASM version for the XingMPG Decoder
; *
; * Copyright (c) 1999, Jean-Michel HERVE
; *
; * Code : TuO / StG
; * Date : 08/04/1999
; *
; * WARNING : only _fdct32_* has been tested. The other ones are made from
; * this function, but hasn't been tested at all. Should check it.
; * NOTE : I don't think that forward_bf and back_bf macros can be more
; * optimized (except maybe in changing algo)
; *
; * This program is free software; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; * $Id: cdctasm.asm,v 1.4 2000/10/13 14:29:02 ijr Exp $
; */
BITS 32
SECTION .data USE32
GLOBAL _forward_bf_asm
GLOBAL _back_bf_asm
;/* fdct function exportation */
GLOBAL _fdct32_asm
GLOBAL _fdct32_dual_asm
;EXTERN _coef32
ValC dd 0
Val4N dd 0
ValN dd 0
ValN2 dd 0
ValN21 dd 0
ValX dd 0
ValF dd 0
ValCoef dd 0
ValP0 dd 0
;/* 32 floats tables */
tab_a dd 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
tab_b dd 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
SECTION .text USE32
;/*
; * forward_bf ASM Version
; */
_forward_bf_asm: ;// PROC m, n, x[], f[], coef[] : DWORD
;// ebp ebp+4 ebp+8 ebp+12 ebp+16
push ebp
lea ebp,[esp + 8]
pushad
;/* Initialization part */
mov eax,[ebp + 4]
mov [ValN], eax
shr eax, 1
mov [ValN2], eax ;/* n2 is eax */
mov esi, [ebp + 8] ;/* x[0] is esi */
mov ebx, [ebp + 12] ;/* f[0] is ebx */
lea edx, [ebx + eax * 4] ;/* f[n2] is edx */
mov edi, [ebp + 16] ;/* coef[] is edi */
mov [ValCoef], edi
mov dword [ValP0], 0
mov ecx, [ebp] ;/* m-controled loop */
.BigLoop:
push ecx
mov edi, [ValCoef] ;/* coef is edi */
mov ecx, [ValP0] ;/* p is ecx */
mov ebp, ecx
add ebp, [ValN2]
dec ebp ;/* q-n2 is ebp */
lea ebp, [esi + ebp * 4] ;/* x[q-n2] is ebp */
;/* This way we can use the loop */
;/* counter to index the table */
;/* it save 1 dec per loop */
mov eax, [ValN2] ;/* n2-controled loop */
.SmallLoop:
fld dword [esi + ecx * 4] ;/* Load x[p] */
fld st0 ;/* Duplicate x[p] */
fld dword [ebp + eax * 4] ;/* Load x[q] */
fld st0 ;/* Duplicate x[q] */
faddp st2, st0 ;/* x[p]+x[q] */
fxch st0, st1 ;/* put value to the top */
fstp dword [ebx + ecx * 4] ;/* Store value */
fsubp st1, st0 ;/* x[p]-x[q] */
fmul dword [edi] ;/* coef[k] * (x[p]-x[q]) */
fstp dword [edx + ecx * 4]
add edi, 4 ;/* Update coef[k] */
inc ecx ;/* Update p */
dec eax ;/* Update loop counter */
jnz .SmallLoop
mov eax, [ValN]
add [ValP0], eax ;/* Update p0 */
pop ecx
dec ecx
jnz .BigLoop
popad
pop ebp
ret
;/*
; * back_bf ASM Version
; */
_back_bf_asm: ;// PROC m, n, x[], f[], : DWORD
;// ebp ebp+4 ebp+8 ebp+12
push ebp
lea ebp, [esp + 8]
pushad
mov edi, [ebp + 8] ;/* x[0] is esi */
mov esi, [ebp + 12] ;/* f[0] is esi */
mov eax, [ebp + 4]
mov [ValN], eax
shl eax, 2
mov [Val4N], eax
shr eax, 3
mov [ValN2], eax
lea edx,[edi + eax * 4]
dec eax
mov [ValN21], eax
mov ecx, [ebp]
mov ebp, edx
.BigLoop:
xor ebx, ebx
mov eax, [ValN2]
.SmallLoop1:
mov edx,[edi + ebx * 4]
mov [esi + ebx * 8], edx ;/* f[p]=x[q] */
inc ebx
dec eax
jnz .SmallLoop1
xor ebx, ebx
mov eax, [ValN21] ;/* n/2 -1 */
.SmallLoop2:
fld dword [ebp + ebx * 4] ;/* x[q] */
fadd dword [ebp + ebx * 4 + 4] ;/* x[q]+x[q+1] */
fstp dword [esi + ebx * 8 + 4] ;/* f[p]=x[q]+x[q+1] */
inc ebx
dec eax
jnz .SmallLoop2
mov edx, [ebp + ebx * 4]
mov [esi + ebx * 8 + 4], edx ;/* f[p]=x[q] */
mov eax, [Val4N]
add edi, eax ;/* Update p0 (edi) */
add ebp, eax ;/* Update p0 (edi) */
add esi, eax ;/* Update p0 (esi) */
dec ecx
jnz .BigLoop
popad
pop ebp
ret
;/*
; * forward_bf macro ASM Version
; */
%macro forward_bf_macro 5 ;// PROC m, n, x[], f[], coef[] : DWORD
;// ebp ebp+4 ebp+8 ebp+12 ebp+16
;/* Initialization part */
mov ebx, %4 ;/* f[0] is ebx */
mov esi, %3 ;/* x[0] is esi */
lea edx, [ebx + ((%2) / 2) * 4] ;/* f[n2] is edx */
xor edi, edi ;/* edi is p0 */
mov ecx, %1 ;/* m-controled loop */
%%BigLoop:
;/* Hiho, unrolled loop */
%assign i 0
%assign j %2-1
%rep (%2)/2
fld dword [esi + edi * 4 + i * 4] ;/* Load x[p] */
fld st0 ;/* Duplicate x[p] */
fld dword [esi + edi * 4 + j * 4] ;/* Load x[q] */
fadd st1, st0 ;/* x[p]+x[q] */
fxch st1 ;/* put value to the top */
fstp dword [ebx + edi * 4 + i * 4] ;/* Store value */
fsubp st1, st0 ;/* x[p]-x[q] */
fmul dword [%5 + i * 4] ;/* coef[k] * (x[p]-x[q]) */
fstp dword [edx + edi * 4 + i * 4]
%assign i i+1
%assign j j-1
%endrep
add edi, %2 ;/* Update p0 */
dec ecx
jnz near %%BigLoop
%endmacro
;/*
; * back_bf macro ASM Version
; */
%macro back_bf_macro 4 ;// PROC m, n, x[], f[], : DWORD
;// ebp ebp+4 ebp+8 ebp+12
mov edi, %3 ;/* x[0] is esi */
mov esi, %4 ;/* f[0] is esi */
mov ebp, %3 + ((%2 / 2) * 4) ;/* x[n2] is ebp */
xor eax, eax ;/* p0 is eax */
;/* Main loop counter initialization (if needed!) */
%if (%1)>1
mov ecx, %1 ;/* Avoid 1 step loop */
%%BigLoop:
%endif
;/* Unrolled loop */
%assign j 0
%rep (%2)/2
mov edx, [edi + eax + j * 4]
mov [esi + eax + j * 8], edx ;/* f[p]=x[q] */
%assign j j+1
%endrep
;/* Another unrolled loop */
%assign j 0
%rep ((%2)/2)-1
fld dword [ebp + eax + j * 4] ;/* x[q] */
fadd dword [ebp + eax + j * 4 + 4] ;/* x[q]+x[q+1] */
fstp dword [esi + eax + j * 8 + 4] ;/* f[p]=x[q]+x[q+1] */
%assign j j+1
%endrep
mov edx, [ebp + eax + j * 4]
mov [esi + eax + j * 8 + 4], edx ;/* f[p]=x[q] */
add eax, %2 * 4 ;/* Update p0 */
;/* Loop */
%if (%1)>1 ;/* Avoid 1 step loop */
dec ecx
jnz near %%BigLoop
%endif
%endmacro
;/* Special case of fdct32 and fdct32_dual functions */
%macro fdct32_specialcase 1
mov edi,[ebp+4] ;/* x[0] is edi */
;/* Unrolled loop */
%assign p 0
%assign q 31
%rep 16
fld dword [edi + p * 4 * %1] ;/* Load x[p] */
fld st0
fld dword [edi + q * 4 * %1] ;/* Load x[q] */
fadd st1, st0 ;/* x[p]+x[q] */
fxch st1 ;/* put value to the top */
fstp dword [tab_a + p * 4] ;/* Store value */
fsubp st1, st0 ;/* x[p]-x[q] */
fmul dword [eax + p * 4] ;/* coef32[p] * (x[p]-x[q]) */
fstp dword [tab_a + p * 4 + 16 * 4]
%assign p p+1
%assign q q-1
%endrep
%endmacro
;/*
; * fdct32 ASM Version
; */
_fdct32_asm: ;// PROC coef32, x[], c[] : DWORD
;// ebp ebp+4 ebp+8
push ebp
lea ebp, [esp + 8]
pushad
mov eax, [ebp + 8]
mov [ValC], eax
mov eax, [ebp]
;/* Special first stage for single chan */
fdct32_specialcase 1
forward_bf_macro 2 , 16, tab_a, tab_b, (eax + (16) * 4)
forward_bf_macro 4 , 8 , tab_b, tab_a, (eax + (16 + 8) * 4)
forward_bf_macro 8 , 4 , tab_a, tab_b, (eax + (16 + 8 + 4) * 4)
forward_bf_macro 16, 2 , tab_b, tab_a, (eax + (16 + 8 + 4 + 2) * 4)
back_bf_macro 8 , 4 , tab_a, tab_b
back_bf_macro 4 , 8 , tab_b, tab_a
back_bf_macro 2 , 16, tab_a, tab_b
back_bf_macro 1 , 32, tab_b, [ValC]
popad
pop ebp
ret
;/*
; * fdct32_dual ASM Version
; */
_fdct32_dual_asm: ;// PROC m, x[], c[] : DWORD
;// ebp ebp+4 ebp+8
push ebp
lea ebp, [esp + 8]
pushad
mov eax, [ebp + 8]
mov [ValC], eax
mov eax, [ebp]
;/* Special first stage for dual chan (interleaved x) */
fdct32_specialcase 2
forward_bf_macro 2 , 16, tab_a, tab_b, (eax + (16) * 4)
forward_bf_macro 4 , 8 , tab_b, tab_a, (eax + (16 + 8) * 4)
forward_bf_macro 8 , 4 , tab_a, tab_b, (eax + (16 + 8 + 4) * 4)
forward_bf_macro 16, 2 , tab_b, tab_a, (eax + (16 + 8 + 4 + 2) * 4)
back_bf_macro 8 , 4 , tab_a, tab_b
back_bf_macro 4 , 8 , tab_b, tab_a
back_bf_macro 2 , 16, tab_a, tab_b
back_bf_macro 1 , 32, tab_b, [ValC]
popad
pop ebp
ret
end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -