; quantize_h263_3dne.asm
quant_intra1 3
psubw mm5, mm4 ;C8
mov esi, [dword esp + 12] ; pop back the register value
mov edi, [esp + 4] ; pop back the register value
sar eax, 16
lea ebx, [byte eax + 1] ; workaround for eax < 0
cmovs eax, ebx ; conditionally move the corrected value
mov [edx], ax ; coeff[0] = ax
mov ebx, [esp + 8] ; pop back the register value
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
psubw mm7, mm6 ;D8
movq [edx + 3 * 32 + 16], mm5 ;C9
movq [edx + 3 * 32 + 24], mm7 ;D9
xor eax, eax
ret
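; Hedged note (not from the original source): 'sar eax, 16' rounds toward
; minus infinity, so for a negative quotient the lea/cmovs pair above adds 1
; back, which is what the "workaround for eax < 0" comment refers to.  In
; rough C, assuming eax arrives holding a 16.16 fixed-point quotient for the
; intra DC coefficient (that multiply sits above this excerpt):
;
;   int32_t dc = product >> 16;     /* sar eax, 16: floors negative values  */
;   if (dc < 0) dc += 1;            /* lea ebx, [eax + 1] + cmovs eax, ebx  */
;   coeff[0] = (int16_t)dc;         /* mov [edx], ax                        */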
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_3dne(int16_t * coeff,
; const int16_t * const data,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
;This is Athlon-optimized code (ca 90 clk per call)
;Optimized by Jaan, 30 Nov 2002
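; Hedged C reference (not part of the original source) of what the pipelined
; loop below computes per coefficient: 'sub' stands for the per-quant entry of
; mmx_sub, and the pmulhw against the mmx_div reciprocal plays the role of the
; division by 2*quant.  A rough sketch, assuming the plain H.263 inter
; quantiser (the mpeg_matrices argument is not read by this routine and is
; omitted):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static uint32_t quant_h263_inter_ref(int16_t *coeff, const int16_t *data,
;                                        uint32_t quant, uint16_t sub)
;   {
;       uint32_t sum = 0;
;       for (int i = 0; i < 64; i++) {
;           int32_t level = abs(data[i]) - sub;         /* psubw/pmaxsw + psubusw */
;           if (level < 0) level = 0;                   /* unsigned saturation    */
;           level /= (int32_t)(2 * quant);              /* pmulhw by reciprocal   */
;           sum += (uint32_t)level;                     /* paddw into mm5         */
;           coeff[i] = (data[i] < 0) ? -level : level;  /* pxor/psubw sign fixup  */
;       }
;       return sum;                                     /* pmaddwd + movd eax     */
;   }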
%macro quantinter 1
movq mm1, [eax] ;A2
psraw mm3, 15 ;B6
%if (%1)
psubw mm2, mm6 ;C10
%endif
psubw mm1, mm0 ;A3
pmulhw mm4, mm7 ;B7
movq mm6, [ecx + %1*24+16] ;C1
pmaxsw mm1, mm0 ;A4
paddw mm5, mm4 ;B8
%if (%1)
movq [edx + %1*24+16-24], mm2 ;C11
%endif
psubusw mm1, [ebx] ;A5 mm1 -= sub (unsigned, don't go below 0)
pxor mm4, mm3 ;B9
movq mm2, [eax] ;C2
psraw mm0, 15 ;A6
psubw mm4, mm3 ;B10
psubw mm2, mm6 ;C3
pmulhw mm1, mm7 ;A7 mm1 /= 2*quant (high word of multiply by reciprocal)
movq mm3, [ecx + %1*24+8] ;B1
pmaxsw mm2, mm6 ;C4
paddw mm5, mm1 ;A8 sum += mm0
%if (%1)
movq [edx + %1*24+8-24], mm4 ;B11
%else
movq [edx + 120], mm4 ;B11
%endif
psubusw mm2, [ebx] ;C5
pxor mm1, mm0 ;A9 mm0 *= sign(mm0)
movq mm4, [eax] ;B2
psraw mm6, 15 ;C6
psubw mm1, mm0 ;A10 undisplace
psubw mm4, mm3 ;B3
pmulhw mm2, mm7 ;C7
movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st]
pmaxsw mm4, mm3 ;B4
paddw mm5, mm2 ;C8
movq [byte edx + %1*24], mm1 ;A11
psubusw mm4, [ebx] ;B5
pxor mm2, mm6 ;C9
%endmacro
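; Hedged note (not from the original source): the A/B/C suffixes in the
; comments mark three software-pipelined dependency chains, one per 8-byte
; group loaded from [ecx + %1*24], +8 and +16; interleaving them lets
; independent instructions hide latency on the Athlon.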
%macro quantinter1 1
movq mm0, [byte ecx + %1*16] ;mm0 = [1st]
movq mm3, [ecx + %1*16+8] ;
movq mm1, [eax]
movq mm4, [eax]
psubw mm1, mm0
psubw mm4, mm3
pmaxsw mm1, mm0
pmaxsw mm4, mm3
psubusw mm1, mm6 ; mm1 -= sub (unsigned, don't go below 0)
psubusw mm4, mm6 ;
psraw mm0, 15
psraw mm3, 15
psrlw mm1, 1 ; mm1 /= 2 (i.e. 2*quant, since quant == 1 here)
psrlw mm4, 1 ;
paddw mm5, mm1 ; sum += mm0
pxor mm1, mm0 ; mm0 *= sign(mm0)
paddw mm5, mm4
pxor mm4, mm3 ;
psubw mm1, mm0 ; undisplace
psubw mm4, mm3
cmp esp, esp
movq [byte edx + %1*16], mm1
movq [edx + %1*16+8], mm4
%endmacro
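; Hedged note (not from the original source): quantinter1 is the quant == 1
; fast path reached through .q1loop below; with 2*quant == 2 the reciprocal
; multiply of the main loop collapses into the single 'psrlw ..., 1', i.e.
; roughly
;
;   level = (abs(data[i]) - sub) >> 1;
;
; per coefficient, with the same sign restore and summation as above.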
ALIGN 16
cglobal quant_h263_inter_3dne
quant_h263_inter_3dne:
mov edx, [esp + 4] ; coeff
mov ecx, [esp + 8] ; data
mov eax, [esp + 12] ; quant
push ebx
pxor mm5, mm5 ; sum
nop
lea ebx, [mmx_sub + eax * 8 - 8] ; sub
movq mm7, [mmx_div + eax * 8 - 8] ; divider
cmp al, 1
lea eax, [mmzero]
jz near .q1loop
cmp esp, esp
ALIGN 8
movq mm3, [ecx + 120] ;B1
pxor mm4, mm4 ;B2
psubw mm4, mm3 ;B3
movq mm0, [ecx] ;A1 mm0 = [1st]
pmaxsw mm4, mm3 ;B4
psubusw mm4, [ebx] ;B5
quantinter 0
quantinter 1
quantinter 2
quantinter 3
quantinter 4
psraw mm3, 15 ;B6
psubw mm2, mm6 ;C10
pmulhw mm4, mm7 ;B7
paddw mm5, mm4 ;B8
pxor mm4, mm3 ;B9
psubw mm4, mm3 ;B10
movq [edx + 4*24+16], mm2 ;C11
pop ebx
movq [edx + 4*24+8], mm4 ;B11
pmaddwd mm5, [plus_one]
movq mm0, mm5
punpckhdq mm5, mm5
paddd mm0, mm5
movd eax, mm0 ; return sum
ret
ALIGN 16
.q1loop:
movq mm6, [byte ebx]
quantinter1 0
quantinter1 1
quantinter1 2
quantinter1 3
quantinter1 4
quantinter1 5
quantinter1 6
quantinter1 7
pmaddwd mm5, [plus_one]
movq mm0, mm5
psrlq mm5, 32
paddd mm0, mm5
movd eax, mm0 ; return sum
pop ebx
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_3dne(int16_t *data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint32_t dcscalar,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; this is the same as dequant_inter_3dne, except that we're
; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
;This is Athlon-optimized code (ca 106 clk per call)
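; Hedged C reference (not part of the original source) of the reconstruction
; the 'dequant' macro below performs, plus the scalar DC handling done at the
; end of the intra routine; 'quant_add' stands for the per-quant entry of
; mmx_add and the unread mpeg_matrices argument is omitted.  A rough sketch:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static uint32_t dequant_h263_intra_ref(int16_t *data, const int16_t *coeff,
;                                          uint32_t quant, uint32_t dcscalar,
;                                          uint16_t quant_add)
;   {
;       for (int i = 0; i < 64; i++) {
;           int32_t c = coeff[i];
;           int32_t v = (c == 0) ? 0 : (int32_t)(2 * quant) * abs(c) + quant_add;
;           if (v > 2047) v = 2047;                /* pminsw with mmx_2047 */
;           data[i] = (int16_t)((c < 0) ? -v : v); /* sign restore         */
;       }
;       /* the DC term is recomputed from coeff[0] and clamped (cmovl/cmovg): */
;       int32_t dc = (int32_t)coeff[0] * (int32_t)dcscalar;
;       if (dc < -2048) dc = -2048;
;       if (dc >  2047) dc =  2047;
;       data[0] = (int16_t)dc;
;       return 0;                                  /* xor eax, eax          */
;   }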
%macro dequant 1
movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2
psubw mm0, mm1 ;-c ;A3 (1st dep)
%if (%1)
paddw mm4, mm6 ;C11 mm6 free (4th+)
%endif
pmaxsw mm0, mm1 ;|c| ;A4 (2nd)
%if (%1)
mov ebp, ebp
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later
%endif
movq mm6, [esi] ;0 ;A5 mm6 in use
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
%if (%1)
pxor mm5, mm4 ;C13 (6th+) 1later
%endif
movq mm4, [esi] ;C1 ;0
mov esp, esp
pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st)
ALIGN 4
psraw mm1, 15 ; sign(c) ;A7 (2nd)
%if (%1)
movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later
%endif
paddw mm7, mm3 ;B10 offset +negate back (3rd)
pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+)
paddw mm2, mm7 ;B11 mm7 free (4th+)
lea ebp, [byte ebp]
movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i]
psubw mm4, mm5 ;-c ;C3 (1st dep)
pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd)
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
pxor mm3, mm2 ;B13 (6th+)
movq mm2, [byte esi] ;B1 ;0
%if (%1)
movq [edx+%1*24+8-24], mm3 ;B14 (7th)
%else
movq [edx+120], mm3
%endif
pmaxsw mm4, mm5 ;|c| ;C4 (2nd)
paddw mm6, mm1 ;A10 offset +negate back (3rd)
movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i]
psubw mm2, mm3 ;-c ;B3 (1st dep)
paddw mm0, mm6 ;A11 mm6 free (4th+)
movq mm6, [byte esi] ;0 ;C5 mm6 in use
pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+)
pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
pxor mm1, mm0 ;A13 (6th+)
pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+)
psraw mm5, 15 ; sign(c) ;C7 (2nd)
movq mm7, [byte esi] ;0 ;B5 mm7 in use
pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
%if (%1 < 4)
movq mm0, [byte esi] ;A1 ;0
%endif
pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd)
psraw mm3, 15 ;sign(c) ;B7 (2nd)
movq [byte edx+%1*24], mm1 ;A14 (7th)
paddw mm6, mm5 ;C10 offset +negate back (3rd)
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
mov esp, esp
%endmacro
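; Hedged note (not from the original source): the same A/B/C interleaving as
; in the quantisation macros is used here, and the apparent no-ops
; ('mov esp, esp', 'mov ebp, ebp', 'lea ebp, [byte ebp]', the mid-macro
; ALIGN 4) look like decode/alignment padding for the Athlon front end rather
; than algorithmic steps.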
ALIGN 16
cglobal dequant_h263_intra_3dne
dequant_h263_intra_3dne:
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
pxor mm0, mm0
pxor mm2, mm2
push edi
push ebx
lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
push ebp
mov ebx, mmx_2047
movsx ebp, word [ecx]
lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
push esi
mov esi, mmzero
pxor mm7, mm7
movq mm3, [ecx+120] ;B2 ; c = coeff[i]
pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
imul ebp, [esp+16+16] ; dcscalar
psubw mm2, mm3 ;-c ;B3 (1st dep)
pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
psraw mm3, 15 ; sign(c) ;B7 (2nd)
mov edx, [esp+ 4+16] ; data
ALIGN 8
dequant 0
cmp ebp, -2048
mov esp, esp
dequant 1
cmovl ebp, [int_2048]
nop
dequant 2
cmp ebp, 2047
mov esp, esp
dequant 3
cmovg ebp, [int2047]
nop
dequant 4
paddw mm4, mm6 ;C11 mm6 free (4th+)
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
mov eax, ebp
mov esi, [esp]
mov ebp, [esp+4]
pxor mm5, mm4 ;C13 (6th+)
paddw mm7, mm3 ;B10 offset +negate back (3rd)
movq [edx+4*24+16], mm5 ;C14 (7th)
paddw mm2, mm7 ;B11 mm7 free (4th+)
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
mov ebx, [esp+8]
mov edi, [esp+12]
add esp, byte 16
pxor mm3, mm2 ;B13 (6th+)
movq [edx+4*24+8], mm3 ;B14 (7th)
mov [edx], ax
xor eax, eax
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_3dne(int16_t * data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; this is the same as dequant_inter_3dne,
; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
; This is Athlon-optimized code (ca 100 clk per call)
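; Hedged note (not from the original source): per coefficient this is the same
; 2*quant*|c| + quant_add reconstruction, saturation and sign restore sketched
; above for the intra routine; only the dcscalar argument and the scalar DC
; fix-up are absent, so the C sketch is not repeated here.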
ALIGN 16
cglobal dequant_h263_inter_3dne
dequant_h263_inter_3dne:
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
pxor mm0, mm0
pxor mm2, mm2
push edi
push ebx
push esi
lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
mov ebx, mmx_2047
pxor mm7, mm7
movq mm3, [ecx+120] ;B2 ; c = coeff[i]
pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
psubw mm2, mm3 ;-c ;B3 (1st dep)
mov esi, mmzero
pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
psraw mm3, 15 ; sign(c) ;B7 (2nd)
mov edx, [dword esp+ 4+12] ; data
ALIGN 8
dequant 0
dequant 1
dequant 2
dequant 3
dequant 4
paddw mm4, mm6 ;C11 mm6 free (4th+)
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
mov esi, [esp]
pxor mm5, mm4 ;C13 (6th+)
paddw mm7, mm3 ;B10 offset +negate back (3rd)
movq [edx+4*24+16], mm5 ;C14 (7th)
paddw mm2, mm7 ;B11 mm7 free (4th+)
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
mov ebx, [esp+4]
mov edi, [esp+8]
add esp, byte 12
pxor mm3, mm2 ;B13 (6th+)
movq [edx+4*24+8], mm3 ;B14 (7th)
xor eax, eax
ret