;/**************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - 3dne Quantization/Dequantization -
; *
; * Copyright(C) 2002-2003 Jaan Kalda
; *
; * This program is free software ; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation ; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program ; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: quantize_h263_3dne.asm,v 1.1 2005/07/21 09:09:03 klschoef Exp $
; *
; *************************************************************************/
;
; these 3dne functions are compatible with iSSE, but are optimized specifically for
; K7 pipelines
; enable dequant saturation to [-2048,2047], test purposes only.
%define SATURATE
BITS 32
%macro cglobal 1
%ifdef PREFIX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
;=============================================================================
; Local data
;=============================================================================
%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif
align 4
int_div:
dd 0
%assign i 1
%rep 255
dd (1 << 16) / (i) + 1
%assign i i+1
%endrep
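; A rough C-style sketch of how this table appears to be used (based on the
; DC path below: "imul eax, [int_div+4*edi]" with edi = dcscalar, followed by
; "sar eax, 16"); the division is replaced by a reciprocal multiply:
;
;   /* illustrative only: q_dc ~= dc / dcscalar */
;   int32_t q_dc = (dc * ((1 << 16) / dcscalar + 1)) >> 16;
;
; e.g. dcscalar = 8 gives a table entry of 8193, and 100*8193 >> 16 = 12 = 100/8.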
ALIGN 16
plus_one:
times 8 dw 1
;-----------------------------------------------------------------------------
; subtract by Q/2 table
;-----------------------------------------------------------------------------
ALIGN 16
mmx_sub:
%assign i 1
%rep 31
times 4 dw i / 2
%assign i i+1
%endrep
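; e.g. for q = 7 each of the four words above is 3 (7/2, truncated).
; This intra routine does not reference mmx_sub; presumably the inter
; quantizer in the full file uses it as a dead-zone offset, roughly:
;
;   /* illustrative only; the quotient then comes from the mmx_div table */
;   int16_t a = (int16_t)(abs(coeff) - q / 2);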
;-----------------------------------------------------------------------------
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (3dnow2 provides _pmulhuw_ which won't cause overflow)
;
;-----------------------------------------------------------------------------
ALIGN 16
mmx_div:
%assign i 1
%rep 31
times 4 dw (1 << 16) / (i * 2) + 1
%assign i i+1
%endrep
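; Worked example of the pmulhw trick described above: pmulhw keeps the high
; 16 bits of a signed 16x16 multiply, i.e. (a*b) >> 16, so with the table
; entry (1 << 16)/(2*q) + 1:
;
;   /* illustrative only: level ~= |coeff| / (2*q) */
;   int16_t level = ((int32_t)abs(coeff) * ((1 << 16) / (2 * q) + 1)) >> 16;
;
; e.g. q = 2: entry = 16385 and 1000*16385 >> 16 = 250 = 1000/(2*2).
; For q = 1 the entry would be 32769, which does not fit in a signed word,
; hence the separate .q1loop below that shifts right by 1 instead.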
;-----------------------------------------------------------------------------
; add by (odd(Q) ? Q : Q - 1) table
;-----------------------------------------------------------------------------
ALIGN 16
mmx_add:
%assign i 1
%rep 31
%if i % 2 != 0
times 4 dw i
%else
times 4 dw i - 1
%endif
%assign i i+1
%endrep
;-----------------------------------------------------------------------------
; multiply by 2Q table
;-----------------------------------------------------------------------------
ALIGN 16
mmx_mul:
%assign i 1
%rep 31
times 4 dw i * 2
%assign i i+1
%endrep
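; Taken together, mmx_mul and mmx_add match the usual H.263 inverse
; quantization (presumably used by the dequant routines of the full file,
; which are not part of this excerpt):
;
;   /* illustrative only, for level != 0; sgn() = sign of level */
;   int16_t coeff = sgn(level) * (2 * q * abs(level) + ((q & 1) ? q : q - 1));
;
; i.e. mmx_mul holds the 2*q factor and mmx_add the odd/even rounding term.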
;-----------------------------------------------------------------------------
; saturation limits
;-----------------------------------------------------------------------------
ALIGN 8
mmx_32768_minus_2048:
times 4 dw (32768-2048)
mmx_32767_minus_2047:
times 4 dw (32767-2047)
ALIGN 16
mmx_2047:
times 4 dw 2047
ALIGN 8
mmzero:
dd 0, 0
int2047:
dd 2047
int_2048:
dd -2048
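; The two biased word constants above support the usual MMX saturating clamp
; to [-2048, 2047] (the dequant code that would use them is not part of this
; excerpt). Upper bound, with 32767-2047 = 30720:
;
;   x <= 2047:  paddsw gives x + 30720 <= 32767 (no saturation),
;               and psubsw of 30720 restores x
;   x >  2047:  paddsw saturates to 32767, psubsw gives 32767 - 30720 = 2047
;
; The lower bound at -2048 works the same way with 32768-2048; mmx_2047,
; int2047 and int_2048 are presumably plain clamp limits for the SATURATE path.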
;=============================================================================
; Code
;=============================================================================
SECTION .text
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_3dne(int16_t * coeff,
; const int16_t * const data,
; const uint32_t quant,
; const uint32_t dcscalar,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; This is Athlon-optimized code (ca. 70 clk per call)
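;
; A rough C-style sketch of what the routine below computes (derived from
; reading the code; illustrative only, sgn() = sign of its argument):
;
;   coeff[0] = (data[0] + sgn(data[0]) * (dcscalar/2)) / dcscalar;   /* DC */
;   for (i = 1; i < 64; i++)                                         /* AC */
;       coeff[i] = sgn(data[i]) * (abs(data[i]) / (2*quant));
;
; The AC division uses the mmx_div reciprocal table (a plain 1-bit shift in
; the quant==1 path), the DC division uses the int_div table, and in the
; path shown eax is cleared so the function returns 0.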
%macro quant_intra1 1
psubw mm1, mm0 ;A3
psubw mm3, mm2 ;B3
%if (%1)
psubw mm5, mm4 ;C8
psubw mm7, mm6 ;D8
%endif
ALIGN 8
db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1
pmaxsw mm1, mm0 ;A4
db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1
pmaxsw mm3, mm2 ;B4
psraw mm0, 15 ;A5
psraw mm2, 15 ;B5
%if (%1)
movq [edx + %1 * 32 + 16-32], mm5 ;C9
movq [edx + %1 * 32 + 24-32], mm7 ;D9
%endif
psrlw mm1, 1 ;A6
psrlw mm3, 1 ;B6
movq mm5, [ebx] ;C2
movq mm7, [ebx] ;D2
pxor mm1, mm0 ;A7
pxor mm3, mm2 ;B7
psubw mm5, mm4 ;C3
psubw mm7, mm6 ;D3
psubw mm1, mm0 ;A8
psubw mm3, mm2 ;B8
%if (%1 == 0)
push ebp
movq mm0, [ecx + %1 * 32 +32]
%elif (%1 < 3)
db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
%endif
pmaxsw mm5, mm4 ;C4
%if (%1 < 3)
db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
%else
cmp esp, esp
%endif
pmaxsw mm7, mm6 ;D4
psraw mm4, 15 ;C5
psraw mm6, 15 ;D5
movq [byte edx + %1 * 32], mm1 ;A9
movq [edx + %1 * 32+8], mm3 ;B9
psrlw mm5, 1 ;C6
psrlw mm7, 1 ;D6
%if (%1 < 3)
movq mm1, [ebx] ;A2
movq mm3, [ebx] ;B2
%endif
%if (%1 == 3)
imul eax, [int_div+4*edi]
%endif
pxor mm5, mm4 ;C7
pxor mm7, mm6 ;D7
%endm
%macro quant_intra 1
; Rules for Athlon:
; 1) schedule latencies
; 2) add/mul and load/store in 2:1 proportion
; 3) avoid splitting >3-byte instructions over 8-byte boundaries
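; Note on rule 3: the "db 0Fh, 6Fh, ..." lines below emit movq loads as raw
; bytes; presumably hand-encoding them (together with the ALIGN 8) keeps the
; instruction placement under control so no long instruction straddles an
; 8-byte fetch boundary. For reference (d8 = the 8-bit displacement byte):
;   db 0Fh, 6Fh, 64h, 21h, d8   ; encodes movq mm4, [ecx + d8]
;   db 0Fh, 6Fh, 74h, 21h, d8   ; encodes movq mm6, [ecx + d8]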
psubw mm1, mm0 ;A3
psubw mm3, mm2 ;B3
%if (%1)
psubw mm5, mm4 ;C8
psubw mm7, mm6 ;D8
%endif
ALIGN 8
db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1
pmaxsw mm1, mm0 ;A4
db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1
pmaxsw mm3, mm2 ;B4
psraw mm0, 15 ;A5
psraw mm2, 15 ;B5
%if (%1)
movq [edx + %1 * 32 + 16-32], mm5 ;C9
movq [edx + %1 * 32 + 24-32], mm7 ;D9
%endif
pmulhw mm1, [esi] ;A6
pmulhw mm3, [esi] ;B6
movq mm5, [ebx] ;C2
movq mm7, [ebx] ;D2
nop
nop
pxor mm1, mm0 ;A7
pxor mm3, mm2 ;B7
psubw mm5, mm4 ;C3
psubw mm7, mm6 ;D3
psubw mm1, mm0 ;A8
psubw mm3, mm2 ;B8
%if (%1 < 3)
db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
%endif
pmaxsw mm5, mm4 ;C4
%if (%1 < 3)
db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
%else
cmp esp, esp
%endif
pmaxsw mm7,mm6 ;D4
psraw mm4, 15 ;C5
psraw mm6, 15 ;D5
movq [byte edx + %1 * 32], mm1 ;A9
movq [edx + %1 * 32+8], mm3 ;B9
pmulhw mm5, [esi] ;C6
pmulhw mm7, [esi] ;D6
%if (%1 < 3)
movq mm1, [ebx] ;A2
movq mm3, [ebx] ;B2
%endif
%if (%1 == 0)
push ebp
%elif (%1 < 3)
nop
%endif
nop
%if (%1 == 3)
imul eax, [int_div+4*edi]
%endif
pxor mm5, mm4 ;C7
pxor mm7, mm6 ;D7
%endmacro
ALIGN 16
cglobal quant_h263_intra_3dne
quant_h263_intra_3dne:
mov eax, [esp + 12] ; quant
mov ecx, [esp + 8] ; data
mov edx, [esp + 4] ; coeff
cmp al, 1
pxor mm1, mm1
pxor mm3, mm3
movq mm0, [ecx] ; mm0 = [1st]
movq mm2, [ecx + 8]
push esi
lea esi, [mmx_div + eax*8 - 8]
push ebx
mov ebx, mmzero
push edi
jz near .q1loop
quant_intra 0
mov ebp, [esp + 16 + 16] ; dcscalar
; NB -- there are 3 pushes in the function preamble and one more
; in "quant_intra 0", thus an added offset of 16 bytes
movsx eax, word [byte ecx] ; DC
quant_intra 1
mov edi, eax
sar edi, 31 ; sign(DC)
shr ebp, byte 1 ; ebp = dcscalar/2
quant_intra 2
sub eax, edi ; DC (+1)
xor ebp, edi ; sign(DC) dcscalar /2 (-1)
mov edi, [esp + 16 + 16] ; dcscalar
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2
mov ebp, [byte esp]
quant_intra 3
psubw mm5, mm4 ;C8
mov esi, [esp + 12] ; pop back the register value
mov edi, [esp + 4] ; pop back the register value
sar eax, 16
lea ebx, [byte eax + 1] ; workaround for eax < 0
cmovs eax, ebx ; conditionally move the corrected value
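; (the reciprocal multiply + sar rounds toward -infinity; adding 1 when the
; result is negative approximates the truncation toward zero of a plain
; integer division)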
mov [edx], ax ; coeff[0] = ax
mov ebx, [esp + 8] ; pop back the register value
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
psubw mm7, mm6 ;D8
movq [edx + 3 * 32 + 16], mm5 ;C9
movq [edx + 3 * 32 + 24], mm7 ;D9
xor eax, eax
ret
ALIGN 16
.q1loop
quant_intra1 0
mov ebp, [esp + 16 + 16] ; dcscalar
movsx eax, word [byte ecx] ; DC
quant_intra1 1
mov edi, eax
sar edi, 31 ; sign(DC)
shr ebp, byte 1 ; ebp = dcscalar /2
quant_intra1 2
sub eax, edi ; DC (+1)
xor ebp, edi ; sign(DC) dcscalar /2 (-1)
mov edi, [esp + 16 + 16] ; dcscalar
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2
mov ebp, [byte esp]