; quantize_h263_mmx.asm
movlhps xmm6, xmm6 ; duplicate into high 8 bytes
cmp al, 1
jz near .qes2_q1loop ; quant == 1: take the shift-based loop below
.qes2_not1
movq mm0, [mmx_div + eax*8 - 8] ; divider
movq2dq xmm7, mm0
movlhps xmm7, xmm7
ALIGN 16
.qes2_loop
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st]
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd]
pxor xmm1, xmm1
pxor xmm4, xmm4
pcmpgtw xmm1, xmm0 ; xmm1 = sign mask of [1st] (0 or 0xffff per word)
pcmpgtw xmm4, xmm3 ; xmm4 = sign mask of [2nd]
pxor xmm0, xmm1 ; xor with sign mask ...
pxor xmm3, xmm4
psubw xmm0, xmm1 ; ... then subtract it -> absolute values
psubw xmm3, xmm4
psubusw xmm0, xmm6 ; subtract bias in xmm6 (unsigned: clamps at 0)
psubusw xmm3, xmm6
pmulhw xmm0, xmm7 ; (level * divider) >> 16
pmulhw xmm3, xmm7
paddw xmm5, xmm0 ; accumulate quantised levels in xmm5
pxor xmm0, xmm1 ; restore sign ...
paddw xmm5, xmm3
pxor xmm3, xmm4
psubw xmm0, xmm1 ; ... (xor with the mask, then subtract it again)
psubw xmm3, xmm4
movdqa [edi + ecx*8], xmm0
movdqa [edi + ecx*8 + 16], xmm3
add ecx, 4
cmp ecx, 16
jnz .qes2_loop
.qes2_done
movdqu xmm6, [plus_one]
pmaddwd xmm5, xmm6 ; horizontal add: 16-bit lanes -> four dword partial sums
movhlps xmm6, xmm5 ; fold the upper two dwords onto the lower two
paddd xmm5, xmm6
movdq2q mm0, xmm5
movq mm5, mm0
psrlq mm5, 32 ; fold the remaining dword
paddd mm0, mm5
movd eax, mm0 ; return sum
pop edi
pop esi
ret
ALIGN 16
.qes2_q1loop
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st]
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd]
pxor xmm1, xmm1
pxor xmm4, xmm4
pcmpgtw xmm1, xmm0 ; sign mask of [1st]
pcmpgtw xmm4, xmm3 ; sign mask of [2nd]
pxor xmm0, xmm1 ; absolute value ...
pxor xmm3, xmm4
psubw xmm0, xmm1 ; ... (xor with mask, then subtract it)
psubw xmm3, xmm4
psubusw xmm0, xmm6 ; subtract bias in xmm6 (unsigned: clamps at 0)
psubusw xmm3, xmm6
psrlw xmm0, 1 ; quant == 1: divide by 2*quant = 2 with a plain shift
psrlw xmm3, 1
paddw xmm5, xmm0 ; accumulate quantised levels
pxor xmm0, xmm1 ; restore sign
paddw xmm5, xmm3
pxor xmm3, xmm4
psubw xmm0, xmm1
psubw xmm3, xmm4
movdqa [edi + ecx*8], xmm0
movdqa [edi + ecx*8 + 16], xmm3
add ecx, 4
cmp ecx, 16
jnz .qes2_q1loop
jmp .qes2_done
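;-----------------------------------------------------------------------------
; A minimal C reference sketch (not part of the build) of what the two loops
; above compute: the plain H.263 inter quantiser, reading from [esi], writing
; to [edi] and returning the sum of quantised magnitudes in eax.  'sub_bias'
; is a placeholder for the bias already loaded into xmm6 before this point,
; and 'mult' for the mmx_div table entry loaded above (roughly a 16-bit
; reciprocal of 2*quant):
;
;   uint32_t sum = 0;
;   for (int i = 0; i < 64; i++) {
;       int32_t level = abs(src[i]);
;       level = (level > sub_bias) ? level - sub_bias : 0; /* psubusw       */
;       level = (quant == 1) ? level >> 1                  /* .qes2_q1loop  */
;                            : (level * mult) >> 16;       /* pmulhw        */
;       sum += level;
;       dst[i] = (src[i] < 0) ? -level : level;            /* restore sign  */
;   }
;   return sum;
;-----------------------------------------------------------------------------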
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_mmx(int16_t *data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint32_t dcscalar,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; note: we only saturate to +2047 *before* restoring the sign.
; Hence, final clamp really is [-2048,2047]
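; A minimal C reference sketch (not part of the build) of what the routine
; below computes, with 'quant_add' as a placeholder for the mmx_add table
; entry, which the code annotates as "quant or quant-1":
;
;   for (int i = 0; i < 64; i++) {            /* DC is overwritten below     */
;       int32_t level = coeff[i];
;       if (level == 0) { data[i] = 0; continue; }
;       int32_t v = 2 * quant * abs(level) + quant_add;
;       v = (level < 0) ? -v : v;
;       data[i] = (v > 2047) ? 2047 : (v < -2048) ? -2048 : v;
;   }
;   data[0] = (int16_t)(coeff[0] * dcscalar); /* pmullw keeps the low 16 bits */
;   data[0] = (data[0] > 2047) ? 2047 : (data[0] < -2048) ? -2048 : data[0];
;   return 0;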
ALIGN 16
dequant_h263_intra_mmx:
mov edx, [esp+ 4] ; data
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant
mov eax, -16
ALIGN 16
.loop
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i]
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1]
pxor mm1, mm1
pxor mm4, mm4
pcmpgtw mm1, mm0 ; sign(c)
pcmpgtw mm4, mm3 ; sign(c')
pxor mm2, mm2
pxor mm5, mm5
pcmpeqw mm2, mm0 ; c is zero
pcmpeqw mm5, mm3 ; c' is zero
pandn mm2, mm6 ; offset = isZero ? 0 : quant_add
pandn mm5, mm6
pxor mm0, mm1 ; negate if negative
pxor mm3, mm4 ; negate if negative
psubw mm0, mm1
psubw mm3, mm4
pmullw mm0, mm7 ; *= 2Q
pmullw mm3, mm7 ; *= 2Q
paddw mm0, mm2 ; + offset
paddw mm3, mm5 ; + offset
paddw mm0, mm1 ; negate back
paddw mm3, mm4 ; negate back
; saturates to +2047
movq mm2, [mmx_32767_minus_2047]
add eax, 2
paddsw mm0, mm2
paddsw mm3, mm2
psubsw mm0, mm2
psubsw mm3, mm2
pxor mm0, mm1
pxor mm3, mm4
movq [edx + 8*eax + 8*16 - 2*8], mm0
movq [edx + 8*eax + 8*16+8 - 2*8], mm3
jnz near .loop
; deal with DC
movd mm0, [ecx]
pmullw mm0, [esp+16] ; dcscalar
movq mm2, [mmx_32767_minus_2047]
paddsw mm0, mm2
psubsw mm0, mm2
movq mm3, [mmx_32768_minus_2048]
psubsw mm0, mm3
paddsw mm0, mm3
movd eax, mm0
mov [edx], ax
xor eax, eax ; return(0);
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_xmm(int16_t *data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint32_t dcscalar,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; this is the same as dequant_h263_intra_mmx, except that we're
; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
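; Both "+2047" idioms in this file compute min(x, 2047) for the non-negative
; magnitudes the loop feeds them; a per-lane C sketch (hypothetical helper
; names, for illustration only):
;
;   /* MMX idiom: saturating add/sub pair against (32767 - 2047) */
;   static int16_t clamp2047_mmx(int16_t x) {
;       int32_t t = x + (32767 - 2047);
;       if (t > 32767) t = 32767;              /* paddsw saturates high     */
;       return (int16_t)(t - (32767 - 2047));  /* psubsw removes the offset */
;   }
;   /* XMM idiom: a single pminsw */
;   static int16_t clamp2047_xmm(int16_t x) { return x < 2047 ? x : 2047; }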
ALIGN 16
dequant_h263_intra_xmm:
mov edx, [esp+ 4] ; data
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant
mov eax, -16
ALIGN 16
.loop
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i]
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1]
pxor mm1, mm1
pxor mm4, mm4
pcmpgtw mm1, mm0 ; sign(c)
pcmpgtw mm4, mm3 ; sign(c')
pxor mm2, mm2
pxor mm5, mm5
pcmpeqw mm2, mm0 ; c is zero
pcmpeqw mm5, mm3 ; c' is zero
pandn mm2, mm6 ; offset = isZero ? 0 : quant_add
pandn mm5, mm6
pxor mm0, mm1 ; negate if negative
pxor mm3, mm4 ; negate if negative
psubw mm0, mm1
psubw mm3, mm4
pmullw mm0, mm7 ; *= 2Q
pmullw mm3, mm7 ; *= 2Q
paddw mm0, mm2 ; + offset
paddw mm3, mm5 ; + offset
paddw mm0, mm1 ; negate back
paddw mm3, mm4 ; negate back
; saturates to +2047
movq mm2, [mmx_2047]
pminsw mm0, mm2
add eax, 2
pminsw mm3, mm2
pxor mm0, mm1
pxor mm3, mm4
movq [edx + 8*eax + 8*16 - 2*8], mm0
movq [edx + 8*eax + 8*16+8 - 2*8], mm3
jnz near .loop
; deal with DC
movd mm0, [ecx]
pmullw mm0, [esp+16] ; dcscalar
movq mm2, [mmx_32767_minus_2047]
paddsw mm0, mm2
psubsw mm0, mm2
movq mm2, [mmx_32768_minus_2048]
psubsw mm0, mm2
paddsw mm0, mm2
movd eax, mm0
mov [edx], ax
xor eax, eax
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_sse2(int16_t *data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint32_t dcscalar,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
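; The SSE2 version below performs the same per-coefficient arithmetic as the
; MMX/XMM routines above, but on eight coefficients per register, so the loop
; counter advances by 4 instead of 2.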
ALIGN 16
dequant_h263_intra_sse2:
mov edx, [esp+ 4] ; data
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
movq mm6, [mmx_add + eax * 8 - 8]
movq mm7, [mmx_mul + eax * 8 - 8]
movq2dq xmm6, mm6
movq2dq xmm7, mm7
movlhps xmm6, xmm6
movlhps xmm7, xmm7
mov eax, -16
ALIGN 16
.loop
movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i]
movdqa xmm3, [ecx + 8*16 + 8*eax+ 16]
pxor xmm1, xmm1
pxor xmm4, xmm4
pcmpgtw xmm1, xmm0 ; sign(c)
pcmpgtw xmm4, xmm3
pxor xmm2, xmm2
pxor xmm5, xmm5
pcmpeqw xmm2, xmm0 ; c is zero
pcmpeqw xmm5, xmm3
pandn xmm2, xmm6 ; offset = isZero ? 0 : quant_add
pandn xmm5, xmm6
pxor xmm0, xmm1 ; negate if negative
pxor xmm3, xmm4
psubw xmm0, xmm1
psubw xmm3, xmm4
pmullw xmm0, xmm7 ; *= 2Q
pmullw xmm3, xmm7
paddw xmm0, xmm2 ; + offset
paddw xmm3, xmm5
paddw xmm0, xmm1 ; negate back
paddw xmm3, xmm4
; saturates to +2047
movdqa xmm2, [sse2_2047]
pminsw xmm0, xmm2
add eax, 4
pminsw xmm3, xmm2
pxor xmm0, xmm1
pxor xmm3, xmm4
movdqa [edx + 8*16 - 8*4 + 8*eax], xmm0
movdqa [edx + 8*16 - 8*4 + 8*eax + 16], xmm3
jnz near .loop
; deal with DC
movd mm0, [ecx]
pmullw mm0, [esp+16] ; dcscalar
movq mm2, [mmx_32767_minus_2047]
paddsw mm0, mm2
psubsw mm0, mm2
movq mm2, [mmx_32768_minus_2048]
psubsw mm0, mm2
paddsw mm0, mm2
movd eax, mm0
mov [edx], ax
xor eax, eax
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_mmx(int16_t * data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
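; A minimal C reference sketch (not part of the build) of the inter
; dequantiser below: the same per-coefficient arithmetic as the intra sketch
; above, applied to all 64 coefficients with no special DC handling, again
; with 'quant_add' as a placeholder for the mmx_add table entry:
;
;   for (int i = 0; i < 64; i++) {
;       int32_t level = coeff[i];
;       if (level == 0) { data[i] = 0; continue; }
;       int32_t v = 2 * quant * abs(level) + quant_add;
;       v = (level < 0) ? -v : v;
;       data[i] = (v > 2047) ? 2047 : (v < -2048) ? -2048 : v;
;   }
;   return 0;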
ALIGN 16
dequant_h263_inter_mmx:
mov edx, [esp+ 4] ; data
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant
mov eax, -16
ALIGN 16
.loop
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i]
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1]
pxor mm1, mm1
pxor mm4, mm4
pcmpgtw mm1, mm0 ; sign(c)
pcmpgtw mm4, mm3 ; sign(c')
pxor mm2, mm2
pxor mm5, mm5
pcmpeqw mm2, mm0 ; c is zero
pcmpeqw mm5, mm3 ; c' is zero
pandn mm2, mm6 ; offset = isZero ? 0 : quant_add
pandn mm5, mm6
pxor mm0, mm1 ; negate if negative
pxor mm3, mm4 ; negate if negative
psubw mm0, mm1
psubw mm3, mm4
pmullw mm0, mm7 ; *= 2Q
pmullw mm3, mm7 ; *= 2Q
paddw mm0, mm2 ; + offset
paddw mm3, mm5 ; + offset
paddw mm0, mm1 ; negate back
paddw mm3, mm4 ; negate back
; saturates to +2047
movq mm2, [mmx_32767_minus_2047]
add eax, 2
paddsw mm0, mm2
paddsw mm3, mm2
psubsw mm0, mm2
psubsw mm3, mm2
pxor mm0, mm1
pxor mm3, mm4
movq [edx + 8*eax + 8*16 - 2*8], mm0
movq [edx + 8*eax + 8*16+8 - 2*8], mm3
jnz near .loop
xor eax, eax
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_xmm(int16_t * data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; this is the same as dequant_h263_inter_mmx,
; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
ALIGN 16
dequant_h263_inter_xmm:
mov edx, [esp+ 4] ; data
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant
mov eax, -16
ALIGN 16
.loop
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i]
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1]
pxor mm1, mm1
pxor mm4, mm4
pcmpgtw mm1, mm0 ; sign(c)
pcmpgtw mm4, mm3 ; sign(c')
pxor mm2, mm2
pxor mm5, mm5
pcmpeqw mm2, mm0 ; c is zero
pcmpeqw mm5, mm3 ; c' is zero
pandn mm2, mm6 ; offset = isZero ? 0 : quant_add
pandn mm5, mm6
pxor mm0, mm1 ; negate if negative
pxor mm3, mm4 ; negate if negative
psubw mm0, mm1
psubw mm3, mm4
pmullw mm0, mm7 ; *= 2Q
pmullw mm3, mm7 ; *= 2Q
paddw mm0, mm2 ; + offset
paddw mm3, mm5 ; + offset
paddw mm0, mm1 ; start restoring sign
paddw mm3, mm4 ; start restoring sign
; saturates to +2047
movq mm2, [mmx_2047]
pminsw mm0, mm2
add eax, 2
pminsw mm3, mm2
pxor mm0, mm1 ; finish restoring sign
pxor mm3, mm4 ; finish restoring sign
movq [edx + 8*eax + 8*16 - 2*8], mm0
movq [edx + 8*eax + 8*16+8 - 2*8], mm3
jnz near .loop
xor eax, eax
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_sse2(int16_t * data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
ALIGN 16
dequant_h263_inter_sse2:
mov edx, [esp + 4] ; data
mov ecx, [esp + 8] ; coeff
mov eax, [esp + 12] ; quant
movq mm6, [mmx_add + eax * 8 - 8]
movq mm7, [mmx_mul + eax * 8 - 8]
movq2dq xmm6, mm6
movq2dq xmm7, mm7
movlhps xmm6, xmm6
movlhps xmm7, xmm7
mov eax, -16
ALIGN 16
.loop
movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i]
movdqa xmm3, [ecx + 8*16 + 8*eax + 16]
pxor xmm1, xmm1
pxor xmm4, xmm4
pcmpgtw xmm1, xmm0 ; sign(c)
pcmpgtw xmm4, xmm3
pxor xmm2, xmm2
pxor xmm5, xmm5
pcmpeqw xmm2, xmm0 ; c is zero
pcmpeqw xmm5, xmm3
pandn xmm2, xmm6
pandn xmm5, xmm6
pxor xmm0, xmm1 ; negate if negative
pxor xmm3, xmm4
psubw xmm0, xmm1
psubw xmm3, xmm4
pmullw xmm0, xmm7 ; *= 2Q
pmullw xmm3, xmm7
paddw xmm0, xmm2 ; + offset
paddw xmm3, xmm5
paddw xmm0, xmm1 ; start restoring sign
paddw xmm3, xmm4
; saturates to +2047
movdqa xmm2, [sse2_2047]
pminsw xmm0, xmm2
add eax, 4
pminsw xmm3, xmm2
pxor xmm0, xmm1 ; finish restoring sign
pxor xmm3, xmm4
movdqa [edx + 8*16 - 8*4 + 8*eax], xmm0
movdqa [edx + 8*16 - 8*4 + 8*eax + 16], xmm3
jnz near .loop
xor eax, eax
ret