; quantize_mpeg_xmm.asm
jg near .lloop
nop
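; Rough outline of one .loop iteration below (esi presumably starts at -14 and
; counts up by 2, so [... + 8*esi + 112] walks the whole 64-coefficient block;
; the 512/640/768/896 byte offsets into [edi] presumably select precomputed
; per-coefficient tables built from mpeg_quant_matrices):
;   level  = (|src[i]| << 4) + rounding bias            ; [edi+640+...]
;   level /= matrix entry: reciprocal multiply           ; [edi+896+...]
;            plus a mismatch/correction step             ; [edi+512+...], [edi+768+...]
;   level /= quant via the mmx_divs reciprocal; sum += level
;   data[i] = level with the original sign of src[i] restored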
ALIGN 16
.loop
movq mm1, [eax + 8*esi+112] ; mm1 = [1st]
psubw mm0, mm1 ;-mm1
movq mm4, [eax + 8*esi + 120] ;
psubw mm3, mm4 ;-mm4
pmaxsw mm0, mm1 ;|src|
pmaxsw mm3, mm4
nop2
psraw mm1, 15 ;sign src
psraw mm4, 15
psllw mm0, 4 ; level << 4
psllw mm3, 4 ;
paddw mm0, [edi + 640 + 8*esi+112]
paddw mm3, [edi + 640 + 8*esi+120]
movq mm5, [edi + 896 + 8*esi+112]
movq mm7, [edi + 896 + 8*esi+120]
pmulhuw mm5, mm0
pmulhuw mm7, mm3
mov esp, esp ; 2-byte no-op (filler)
movq mm2, [edi + 512 + 8*esi+112]
movq mm6, [edi + 512 + 8*esi+120]
pmullw mm2, mm5
pmullw mm6, mm7
psubw mm0, mm2
psubw mm3, mm6
movq mm2, [byte ebx]
movq mm6, [mmx_divs + ecx * 8 - 8]
pmulhuw mm0, [edi + 768 + 8*esi+112]
pmulhuw mm3, [edi + 768 + 8*esi+120]
paddw mm2, [ebx+8] ;sum
paddw mm5, mm0
paddw mm7, mm3
pxor mm0, mm0
pxor mm3, mm3
pmulhuw mm5, mm6 ; mm5 = (level * reciprocal) >> 16
pmulhuw mm7, mm6 ; i.e. level / quant (0<quant<32)
add esi, byte 2
paddw mm2, mm5 ;sum += x1
movq [ebx], mm7 ;store x2
pxor mm5, mm1 ; mm5 *= sign(src)
pxor mm7, mm4 ;
psubw mm5, mm1 ; undisplace
psubw mm7, mm4 ;
db 0Fh, 7Fh, 54h, 23h, 08 ; movq [ebx+8], mm2 (hand-encoded long form) ;store sum
movq [edx + 8*esi+112-16], mm5
movq [edx + 8*esi +120-16], mm7
jng near .loop
.done
; fold the partial sums accumulated in mm2 into one dword and return it in eax
paddw mm2, [ebx]
mov ebx, [esp+24]
mov edi, [esp+4+24]
mov esi, [esp+8+24]
add esp, byte 12+24
pmaddwd mm2, [mmx_one]
punpckldq mm0, mm2 ;copy low dword of mm2 into the high dword of mm0
paddd mm0, mm2 ;high dword of mm0 = low dword + high dword of mm2
punpckhdq mm0, mm0 ;move that sum down to the low dword
movd eax, mm0
ret
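; The .q1loop below appears to be the quant == 1 specialisation of .loop: the
; final division by the quantizer reduces to a plain shift (psrlw by 1), so no
; mmx_divs reciprocal multiply is needed.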
ALIGN 16
.q1loop
movq mm1, [eax + 8*esi+112] ; mm1 = [1st]
psubw mm0, mm1 ;-mm1
movq mm4, [eax + 8*esi+120]
psubw mm3, mm4 ;-mm4
pmaxsw mm0, mm1 ;|src|
pmaxsw mm3, mm4
nop2
psraw mm1, 15 ; sign src
psraw mm4, 15
psllw mm0, 4 ; level << 4
psllw mm3, 4
paddw mm0, [edi + 640 + 8*esi+112] ;mm0 is to be divided
paddw mm3, [edi + 640 + 8*esi+120] ; inter1 contains fix for division by 1
movq mm5, [edi + 896 + 8*esi+112] ;with rounding down
movq mm7, [edi + 896 + 8*esi+120]
pmulhuw mm5, mm0
pmulhuw mm7, mm3 ;mm7: first approx of division
mov esp, esp ; 2-byte no-op (filler)
movq mm2, [edi + 512 + 8*esi+112]
movq mm6, [edi + 512 + 8*esi+120] ; divs for q<=16
pmullw mm2, mm5 ;test value <= original
pmullw mm6, mm7
psubw mm0, mm2 ;mismatch
psubw mm3, mm6
movq mm2, [byte ebx]
pmulhuw mm0, [edi + 768 + 8*esi+112] ;correction
pmulhuw mm3, [edi + 768 + 8*esi+120]
paddw mm2, [ebx+8] ;sum
paddw mm5, mm0 ;final result
paddw mm7, mm3
pxor mm0, mm0
pxor mm3, mm3
psrlw mm5, 1 ; (level ) /2 (quant = 1)
psrlw mm7, 1
add esi, byte 2
paddw mm2, mm5 ;sum += x1
movq [ebx], mm7 ;store x2
pxor mm5, mm1 ; mm5 *= sign(src)
pxor mm7, mm4 ;
psubw mm5, mm1 ; undisplace
psubw mm7, mm4 ;
movq [ebx+8], mm2 ;store sum
movq [edx + 8*esi+112-16], mm5
movq [edx + 8*esi +120-16], mm7
jng near .q1loop
jmp near .done
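; The .lloop below appears to handle the remaining quantizer range: it uses the
; mmx_div table instead of mmx_divs and adds an extra psrlw by 1 after the
; reciprocal multiply. Which of .loop/.q1loop/.lloop runs is decided by dispatch
; code ahead of this excerpt.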
ALIGN 8
.lloop
movq mm1, [eax + 8*esi+112] ; mm1 = [1st]
psubw mm0,mm1 ;-mm1
movq mm4, [eax + 8*esi+120]
psubw mm3,mm4 ;-mm4
pmaxsw mm0,mm1 ;|src|
pmaxsw mm3,mm4
nop2
psraw mm1,15 ;sign src
psraw mm4,15
psllw mm0, 4 ; level << 4
psllw mm3, 4 ;
paddw mm0, [edi + 640 + 8*esi+112] ;mm0 is to be divided inter1 contains fix for division by 1
paddw mm3, [edi + 640 + 8*esi+120]
movq mm5,[edi + 896 + 8*esi+112]
movq mm7,[edi + 896 + 8*esi+120]
pmulhuw mm5,mm0
pmulhuw mm7,mm3 ;mm7: first approx of division
mov esp,esp ; 2-byte no-op (filler)
movq mm2,[edi + 512 + 8*esi+112]
movq mm6,[edi + 512 + 8*esi+120]
pmullw mm2,mm5 ;test value <= original
pmullw mm6,mm7
psubw mm0,mm2 ;mismatch
psubw mm3,mm6
movq mm2,[byte ebx]
movq mm6,[mmx_div + ecx * 8 - 8] ; divs for q<=16
pmulhuw mm0,[edi + 768 + 8*esi+112] ;correction
pmulhuw mm3,[edi + 768 + 8*esi+120]
paddw mm2,[ebx+8] ;sum
paddw mm5,mm0 ;final result
paddw mm7,mm3
pxor mm0,mm0
pxor mm3,mm3
pmulhuw mm5, mm6 ; mm5 = (level * reciprocal) >> 16
pmulhuw mm7, mm6 ; level / quant (0<quant<32), halved just below
add esi,byte 2
psrlw mm5, 1 ; (level ) / (2*quant)
paddw mm2,mm5 ;sum += x1
psrlw mm7, 1
movq [ebx],mm7 ;store x2
pxor mm5, mm1 ; mm5 *= sign(src)
pxor mm7, mm4 ;
psubw mm5, mm1 ; undisplace
psubw mm7, mm4 ;
db 0Fh, 7Fh, 54h, 23h, 08 ; movq [ebx+8], mm2 (hand-encoded long form) ;store sum
movq [edx + 8*esi+112-16], mm5
movq [edx + 8*esi +120-16], mm7
jng near .lloop
jmp near .done
;-----------------------------------------------------------------------------
;
; uint32_t dequant_mpeg_intra_3dne(int16_t *data,
;                                  const int16_t * const coeff,
;                                  const uint32_t quant,
;                                  const uint32_t dcscalar,
;                                  const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; Note: in order to saturate 'easily', we pre-shift the quantizer
; by 4. The high word of (coeff[]*matrix[i]*quant) is then used to
; build a saturating mask; it is non-zero only when an overflow occurred.
; We thus avoid packing/unpacking toward double-words.
; Moreover, we perform the multiply (matrix[i]*quant) first, instead of, e.g.,
; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] is not
; range-checked. Input ranges are: coeff in [-127,127], intra_matrix in [1..255],
; and quant in [1..31].
;
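; An illustrative per-coefficient C sketch of the scheme above (helper names such
; as sat_add_u16/apply_sign are made up, and 'qm' stands for the pre-shifted
; quantizer kept in mm7/mm2, i.e. the mmx_mul_quant entry << 2):
;
;   uint32_t p  = (uint32_t)abs(coeff[i]) * (uint16_t)(matrix[i] * qm);
;   uint16_t lo = (uint16_t)p;               // pmullw result
;   int16_t  hi = (int16_t)(p >> 16);        // pmulhw result
;   uint16_t ov = (hi > 0) ? 0xFFFF : 0;     // pcmpgtw against 0: overflow mask
;   lo = sat_add_u16(lo, ov);                // paddusw: sticks at 0xFFFF on overflow
;   data[i] = apply_sign(lo >> 5, coeff[i]); // scale back, then restore the sign
;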
%macro DEQUANT4INTRAMMX 1
movq mm1, [byte ecx+ 16 * %1] ; mm1 = c = coeff[i]
movq mm4, [ecx+ 16 * %1 +8] ; mm4 = c' = coeff[i+1]
psubw mm0, mm1
psubw mm3, mm4
pmaxsw mm0, mm1
pmaxsw mm3, mm4
psraw mm1, 15
psraw mm4, 15
%if %1
movq mm2, [eax+8] ;preshifted quant
movq mm7, [eax+8]
%endif
pmullw mm2, [edi + 16 * %1 ] ; matrix[i]*quant
pmullw mm7, [edi + 16 * %1 +8] ; matrix[i+1]*quant
movq mm5, mm0
movq mm6, mm3
pmulhw mm0, mm2 ; high of coeff*(matrix*quant)
pmulhw mm3, mm7 ; high of coeff*(matrix*quant)
pmullw mm2, mm5 ; low of coeff*(matrix*quant)
pmullw mm7, mm6 ; low of coeff*(matrix*quant)
pcmpgtw mm0, [eax]
pcmpgtw mm3, [eax]
paddusw mm2, mm0
paddusw mm7, mm3
psrlw mm2, 5
psrlw mm7, 5
pxor mm2, mm1 ; start negating back
pxor mm7, mm4 ; start negating back
psubusw mm1, mm0
psubusw mm4, mm3
movq mm0, [eax] ;zero
movq mm3, [eax] ;zero
psubw mm2, mm1 ; finish negating back
psubw mm7, mm4 ; finish negating back
movq [byte edx + 16 * %1], mm2 ; data[i]
movq [edx + 16 * %1 +8], mm7 ; data[i+1]
%endmacro
ALIGN 16
dequant_mpeg_intra_3dne:
mov eax, [esp+12] ; quant
mov ecx, [esp+8] ; coeff
movq mm7, [mmx_mul_quant + eax*8 - 8]
psllw mm7, 2 ; << 2. See comment.
mov edx, [esp+4] ; data
push ebx
movsx ebx, word [ecx]
pxor mm0, mm0
pxor mm3, mm3
push esi
lea eax, [esp-28]
sub esp, byte 32
and eax, byte -8 ;points to qword ALIGNed space on stack
movq [eax], mm0
movq [eax+8], mm7
imul ebx, [esp+16+8+32] ; dcscalar
movq mm2, mm7
push edi
mov edi, [esp + 32 + 12 + 20] ; mpeg_quant_matrices
ALIGN 4
DEQUANT4INTRAMMX 0
mov esi, -2048
nop
cmp ebx, esi
DEQUANT4INTRAMMX 1
cmovl ebx, esi
neg esi
sub esi, byte 1 ;2047
DEQUANT4INTRAMMX 2
cmp ebx, esi
cmovg ebx, esi
lea ebp, [byte ebp] ; 3-byte no-op (lea ebp,[ebp+0]), used as filler
DEQUANT4INTRAMMX 3
mov esi, [esp+36]
mov [byte edx], bx ; data[0] = clamped DC value
mov ebx, [esp+36+4]
DEQUANT4INTRAMMX 4
DEQUANT4INTRAMMX 5
DEQUANT4INTRAMMX 6
DEQUANT4INTRAMMX 7
pop edi
add esp, byte 32+8
xor eax, eax
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_mpeg_inter_3dne(int16_t * data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
; It's mixed with the extraction of the absolute value.
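;
; An illustrative per-coefficient C sketch of the trick (names are made up; 'qm'
; stands for the doubled mmx_mul_quant entry kept in mm7, and the clamp mirrors
; the unsigned-saturating add used below):
;
;   int16_t  c = coeff[i];
;   uint32_t m = (c == 0) ? 0 : 2*abs(c) + 1;     // built by the pcmpgtw/paddsw/pxor run
;   uint32_t v = (m * (uint16_t)(matrix[i] * qm)) >> 5;
;   if (v > 2047) v = 2047;                       // overflow mask + paddusw + psrlw
;   data[i] = (c < 0) ? -(int32_t)v : (int32_t)v; // sign restored from the pcmpgtw mask
;   mismatch ^= data[i];                          // XOR-accumulated in mm6 for the epilogue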
ALIGN 16
dequant_mpeg_inter_3dne:
mov edx, [esp+ 4] ; data
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
movq mm7, [mmx_mul_quant + eax*8 - 8]
mov eax, -14
paddw mm7, mm7 ; << 1
pxor mm6, mm6 ; mismatch sum
push esi
push edi
mov esi, mmzero
pxor mm1, mm1
pxor mm3, mm3
mov edi, [esp + 8 + 16] ; mpeg_quant_matrices
nop
nop4
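; eax runs from -14 up to 0 in steps of 2, so [ecx + 8*eax + 7*16] walks the 64
; coefficients a qword pair at a time; [esi] points at an all-zero qword (mmzero)
; used below for sign extraction and comparisons.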
ALIGN 16
.loop
movq mm0, [ecx+8*eax + 7*16 ] ; mm0 = coeff[i]
pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved)
movq mm2, [ecx+8*eax + 7*16 +8] ; mm2 = coeff[i+1]
pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved)
paddsw mm0, mm1 ; c += sgn(c)
paddsw mm2, mm3 ; c += sgn(c')
paddw mm0, mm0 ; c *= 2
paddw mm2, mm2 ; c'*= 2
movq mm4, [esi]
movq mm5, [esi]
psubw mm4, mm0 ; -c
psubw mm5, mm2 ; -c'
psraw mm4, 16 ; mm4 = sgn(-c)
psraw mm5, 16 ; mm5 = sgn(-c')
psubsw mm0, mm4 ; c -= sgn(-c)
psubsw mm2, mm5 ; c' -= sgn(-c')
pxor mm0, mm1 ; finish changing sign if needed
pxor mm2, mm3 ; finish changing sign if needed
; we're short on registers here. Poor pairing...
movq mm4, mm7 ; (matrix*quant)
nop
pmullw mm4, [edi + 512 + 8*eax + 7*16]
movq mm5, mm4
pmulhw mm5, mm0 ; high of c*(matrix*quant)
pmullw mm0, mm4 ; low of c*(matrix*quant)
movq mm4, mm7 ; (matrix*quant)
pmullw mm4, [edi + 512 + 8*eax + 7*16 + 8]
add eax, byte 2
pcmpgtw mm5, [esi]
paddusw mm0, mm5
psrlw mm0, 5
pxor mm0, mm1 ; start restoring sign
psubusw mm1, mm5
movq mm5, mm4
pmulhw mm5, mm2 ; high of c*(matrix*quant)
pmullw mm2, mm4 ; low of c*(matrix*quant)
psubw mm0, mm1 ; finish restoring sign
pcmpgtw mm5, [esi]
paddusw mm2, mm5
psrlw mm2, 5
pxor mm2, mm3 ; start restoring sign
psubusw mm3, mm5
psubw mm2, mm3 ; finish restoring sign
movq mm1, [esi]
movq mm3, [byte esi]
pxor mm6, mm0 ; mismatch control
movq [edx + 8*eax + 7*16 -2*8 ], mm0 ; data[i]
pxor mm6, mm2 ; mismatch control
movq [edx + 8*eax + 7*16 -2*8 +8], mm2 ; data[i+1]
jng .loop
nop
; mismatch control
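; mm6 holds the XOR of every stored coefficient; the pshufw/pxor sequence folds
; its four words into one, whose bit 0 is the parity of the sum of all data[i].
; If that parity is even, the LSB of data[63] is toggled below (MPEG-2 style
; mismatch control).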
pshufw mm0, mm6, 01010101b
pshufw mm1, mm6, 10101010b
pshufw mm2, mm6, 11111111b
pxor mm6, mm0
pxor mm1, mm2
pxor mm6, mm1
movd eax, mm6
pop edi
and eax, byte 1
xor eax, byte 1
mov esi, [esp]
add esp, byte 4
xor word [edx + 2*63], ax
xor eax, eax
ret