📄 quantize_mpeg_mmx.asm
字号:
psrlw mm2, 1
paddw mm3, mm2
movq mm2, [ebx + 768 + ecx*8 + 8]
pmulhw mm3, mm2
pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16
pmulhw mm3, mm7 ;
psrlw mm0, 1 ; additional shift by 1 => 16 + 1 = 17
psrlw mm3, 1
paddw mm5, mm0 ; sum += mm0
pxor mm0, mm1 ; mm0 *= sign(mm0)
paddw mm5, mm3 ;
pxor mm3, mm4 ;
psubw mm0, mm1 ; undisplace
psubw mm3, mm4
movq [edi + 8*ecx], mm0
movq [edi + 8*ecx + 8], mm3
add ecx, 2
cmp ecx, 16
jnz near .loop
.done
pmaddwd mm5, [mmx_one]
movq mm0, mm5
psrlq mm5, 32
paddd mm0, mm5
movd eax, mm0 ; return sum
pop ebx
pop edi
pop esi
pop ecx
ret
ALIGN 16
.q1loop
movq mm0, [esi + 8*ecx] ; mm0 = [1st]
movq mm3, [esi + 8*ecx+ 8]
pxor mm1, mm1 ; mm1 = 0
pxor mm4, mm4 ;
pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
pcmpgtw mm4, mm3 ;
pxor mm0, mm1 ; mm0 = |mm0|
pxor mm3, mm4 ;
psubw mm0, mm1 ; displace
psubw mm3, mm4 ;
psllw mm0, 4
psllw mm3, 4
movq mm2, [ebx + 512 + 8*ecx]
psrlw mm2, 1
paddw mm0, mm2
movq mm2, [ebx + 768 + ecx*8]
pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
movq mm2, [ebx + 512 + 8*ecx + 8]
psrlw mm2, 1
paddw mm3, mm2
movq mm2, [ebx + 768 + ecx*8 + 8]
pmulhw mm3, mm2
psrlw mm0, 1 ; mm0 >>= 1 (/2)
psrlw mm3, 1 ;
paddw mm5, mm0 ; sum += mm0
pxor mm0, mm1 ; mm0 *= sign(mm0)
paddw mm5, mm3 ;
pxor mm3, mm4 ;
psubw mm0, mm1 ; undisplace
psubw mm3, mm4
movq [edi + 8*ecx], mm0
movq [edi + 8*ecx + 8], mm3
add ecx, 2
cmp ecx, 16
jnz near .q1loop
jmp .done
ALIGN 16
.q2loop
movq mm0, [esi + 8*ecx] ; mm0 = [1st]
movq mm3, [esi + 8*ecx+ 8]
pxor mm1, mm1 ; mm1 = 0
pxor mm4, mm4 ;
pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
pcmpgtw mm4, mm3 ;
pxor mm0, mm1 ; mm0 = |mm0|
pxor mm3, mm4 ;
psubw mm0, mm1 ; displace
psubw mm3, mm4 ;
psllw mm0, 4
psllw mm3, 4
movq mm2, [ebx + 512 + 8*ecx]
psrlw mm2, 1
paddw mm0, mm2
movq mm2, [ebx + 768 + ecx*8]
pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
movq mm2, [ebx + 512 + 8*ecx + 8]
psrlw mm2, 1
paddw mm3, mm2
movq mm2, [ebx + 768 + ecx*8 + 8]
pmulhw mm3, mm2
psrlw mm0, 2 ; mm0 >>= 1 (/2)
psrlw mm3, 2 ;
paddw mm5, mm0 ; sum += mm0
pxor mm0, mm1 ; mm0 *= sign(mm0)
paddw mm5, mm3 ;
pxor mm3, mm4 ;
psubw mm0, mm1 ; undisplace
psubw mm3, mm4
movq [edi + 8*ecx], mm0
movq [edi + 8*ecx + 8], mm3
add ecx, 2
cmp ecx, 16
jnz near .q2loop
jmp .done
;-----------------------------------------------------------------------------
;
; uint32_t dequant_mpeg_intra_mmx(int16_t *data,
; const int16_t const *coeff,
; const uint32_t quant,
; const uint32_t dcscalar,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; Note: in order to saturate 'easily', we pre-shift the quantifier
; by 4. Then, the high-word of (coeff[]*matrix[i]*quant) are used to
; build a saturating mask. It is non-zero only when an overflow occured.
; We thus avoid packing/unpacking toward double-word.
; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],a
; and quant in [1..31].
;
; The original loop is:
;
%if 0
movq mm0, [ecx+8*eax + 8*16] ; mm0 = coeff[i]
pxor mm1, mm1
pcmpgtw mm1, mm0
pxor mm0, mm1 ; change sign if negative
psubw mm0, mm1 ; -> mm0 = abs(coeff[i]), mm1 = sign of coeff[i]
movq mm2, mm7 ; mm2 = quant
pmullw mm2, [ebx + 8*eax + 8*16 ] ; matrix[i]*quant.
movq mm6, mm2
pmulhw mm2, mm0 ; high of coeff*(matrix*quant) (should be 0 if no overflow)
pmullw mm0, mm6 ; low of coeff*(matrix*quant)
pxor mm5, mm5
pcmpgtw mm2, mm5 ; otherflow?
psrlw mm2, 5 ; =0 if no clamp, 2047 otherwise
psrlw mm0, 5
paddw mm0, mm1 ; start restoring sign
por mm0, mm2 ; saturate to 2047 if needed
pxor mm0, mm1 ; finish negating back
movq [edx + 8*eax + 8*16], mm0 ; data[i]
add eax, 1
%endif
;********************************************************************
ALIGN 16
dequant_mpeg_intra_mmx:
push ebx
mov edx, [esp + 4 + 4] ; data
mov ecx, [esp + 4 + 8] ; coeff
mov eax, [esp + 4 + 12] ; quant
mov ebx, [esp + 4 + 20] ; mpeg_quant_matrices
movq mm7, [mmx_mul_quant + eax*8 - 8]
mov eax, -16 ; to keep ALIGNed, we regularly process coeff[0]
psllw mm7, 2 ; << 2. See comment.
pxor mm6, mm6 ; this is a NOP
ALIGN 16
.loop
movq mm0, [ecx+8*eax + 8*16] ; mm0 = c = coeff[i]
movq mm3, [ecx+8*eax + 8*16 +8]; mm3 = c' = coeff[i+1]
pxor mm1, mm1
pxor mm4, mm4
pcmpgtw mm1, mm0 ; mm1 = sgn(c)
movq mm2, mm7 ; mm2 = quant
pcmpgtw mm4, mm3 ; mm4 = sgn(c')
pmullw mm2, [ebx + 8*eax + 8*16 ] ; matrix[i]*quant
pxor mm0, mm1 ; negate if negative
pxor mm3, mm4 ; negate if negative
psubw mm0, mm1
psubw mm3, mm4
; we're short on register, here. Poor pairing...
movq mm5, mm2
pmullw mm2, mm0 ; low of coeff*(matrix*quant)
pmulhw mm0, mm5 ; high of coeff*(matrix*quant)
movq mm5, mm7 ; mm2 = quant
pmullw mm5, [ebx + 8*eax + 8*16 +8] ; matrix[i+1]*quant
movq mm6, mm5
add eax,2 ; z-flag will be tested later
pmullw mm6, mm3 ; low of coeff*(matrix*quant)
pmulhw mm3, mm5 ; high of coeff*(matrix*quant)
pcmpgtw mm0, [zero]
paddusw mm2, mm0
psrlw mm2, 5
pcmpgtw mm3, [zero]
paddusw mm6, mm3
psrlw mm6, 5
pxor mm2, mm1 ; start negating back
pxor mm6, mm4 ; start negating back
psubusw mm1, mm0
psubusw mm4, mm3
psubw mm2, mm1 ; finish negating back
psubw mm6, mm4 ; finish negating back
movq [edx + 8*eax + 8*16 -2*8 ], mm2 ; data[i]
movq [edx + 8*eax + 8*16 -2*8 +8], mm6 ; data[i+1]
jnz near .loop
; deal with DC
movd mm0, [ecx]
pmullw mm0, [esp + 4 + 16] ; dcscalar
movq mm2, [mmx_32767_minus_2047]
paddsw mm0, mm2
psubsw mm0, mm2
movq mm2, [mmx_32768_minus_2048]
psubsw mm0, mm2
paddsw mm0, mm2
movd eax, mm0
mov [edx], ax
xor eax, eax
pop ebx
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_mpeg_inter_mmx(int16_t * data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
; It's mixed with the extraction of the absolute value.
ALIGN 16
dequant_mpeg_inter_mmx:
push ebx
mov edx, [esp + 4 + 4] ; data
mov ecx, [esp + 4 + 8] ; coeff
mov eax, [esp + 4 + 12] ; quant
mov ebx, [esp + 4 + 16] ; mpeg_quant_matrices
movq mm7, [mmx_mul_quant + eax*8 - 8]
mov eax, -16
paddw mm7, mm7 ; << 1
pxor mm6, mm6 ; mismatch sum
ALIGN 16
.loop
movq mm0, [ecx+8*eax + 8*16 ] ; mm0 = coeff[i]
movq mm2, [ecx+8*eax + 8*16 +8] ; mm2 = coeff[i+1]
add eax, 2
pxor mm1, mm1
pxor mm3, mm3
pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved)
pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved)
paddsw mm0, mm1 ; c += sgn(c)
paddsw mm2, mm3 ; c += sgn(c')
paddw mm0, mm0 ; c *= 2
paddw mm2, mm2 ; c'*= 2
pxor mm4, mm4
pxor mm5, mm5
psubw mm4, mm0 ; -c
psubw mm5, mm2 ; -c'
psraw mm4, 16 ; mm4 = sgn(-c)
psraw mm5, 16 ; mm5 = sgn(-c')
psubsw mm0, mm4 ; c -= sgn(-c)
psubsw mm2, mm5 ; c' -= sgn(-c')
pxor mm0, mm1 ; finish changing sign if needed
pxor mm2, mm3 ; finish changing sign if needed
; we're short on register, here. Poor pairing...
movq mm4, mm7 ; (matrix*quant)
pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8]
movq mm5, mm4
pmulhw mm5, mm0 ; high of c*(matrix*quant)
pmullw mm0, mm4 ; low of c*(matrix*quant)
movq mm4, mm7 ; (matrix*quant)
pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8 + 8]
pcmpgtw mm5, [zero]
paddusw mm0, mm5
psrlw mm0, 5
pxor mm0, mm1 ; start restoring sign
psubusw mm1, mm5
movq mm5, mm4
pmulhw mm5, mm2 ; high of c*(matrix*quant)
pmullw mm2, mm4 ; low of c*(matrix*quant)
psubw mm0, mm1 ; finish restoring sign
pcmpgtw mm5, [zero]
paddusw mm2, mm5
psrlw mm2, 5
pxor mm2, mm3 ; start restoring sign
psubusw mm3, mm5
psubw mm2, mm3 ; finish restoring sign
pxor mm6, mm0 ; mismatch control
movq [edx + 8*eax + 8*16 -2*8 ], mm0 ; data[i]
pxor mm6, mm2 ; mismatch control
movq [edx + 8*eax + 8*16 -2*8 +8], mm2 ; data[i+1]
jnz near .loop
; mismatch control
movq mm0, mm6
psrlq mm0, 48
movq mm1, mm6
movq mm2, mm6
psrlq mm1, 32
pxor mm6, mm0
psrlq mm2, 16
pxor mm6, mm1
pxor mm6, mm2
movd eax, mm6
and eax, 1
xor eax, 1
xor word [edx + 2*63], ax
xor eax, eax
pop ebx
ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -