; File: quantize4_mmx.asm
; (web code-viewer residue: "font size" control - not part of the source)
pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
movq mm2, [intra_matrix + 8*ecx + 8]
psrlw mm2, 1
paddw mm3, mm2
movq mm2, [intra_matrix_fix + ecx*8 + 8]
pmulhw mm3, mm2
paddw mm0, mm5
paddw mm3, mm5
psrlw mm0, 2 ; mm0 >>= 1 (/4)
psrlw mm3, 2 ;
pxor mm0, mm1 ; mm0 *= sign(mm0)
pxor mm3, mm4 ;
psubw mm0, mm1 ; undisplace
psubw mm3, mm4 ;
movq [edi + 8*ecx], mm0
movq [edi + 8*ecx + 8], mm3
add ecx,2
cmp ecx,16
jnz near .q2loop
jmp near .done
;===========================================================================
;
; uint32_t quant4_inter_mmx(int16_t * coeff,
;                           const int16_t * const data,
;                           const uint32_t quant);
;
; Quantizes 64 int16 values from 'data' into 'coeff' using inter_matrix /
; inter_matrix_fix and returns the sum of the absolute quantized levels.
;
; Stack arguments (32-bit, addressed relative to esp after three pushes):
;   [esp + 12 + 4]   coeff  - destination
;   [esp + 12 + 8]   data   - source
;   [esp + 12 + 12]  quant  - only the low byte (al) is tested when picking
;                             the quant == 1 / quant == 2 fast paths
;
; Register roles inside the loops:
;   ecx      qword index; +2 per iteration (16 bytes = 8 values), stops at 16
;   esi/edi  source / destination pointers
;   mm0/mm3  the two groups of four values being processed
;   mm1/mm4  their sign masks (0xFFFF where the input word was negative)
;   mm2      scratch for inter_matrix / inter_matrix_fix rows
;   mm5      running per-word sum of the unsigned levels
;   mm7      [mmx_div + quant*8 - 8], the "divider" (generic path only)
;
; ecx, esi and edi are saved and restored; the result is returned in eax.
; No emms is issued here.
;
;===========================================================================

; Front half of one iteration: load eight values, split off their signs and
; scale the magnitudes:
;     m = high16( ((|x| << 4) + (inter_matrix[i] >> 1)) * inter_matrix_fix[i] )
; The original source describes this as
;     (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
; NOTE(review): inter_matrix_fix is defined outside this view; presumably a
; fixed-point reciprocal table - confirm.
%macro QUANT4_INTER_SCALE 0
        movq    mm0, [esi + 8*ecx]              ; first four values
        movq    mm3, [esi + 8*ecx + 8]          ; next four values
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                        ; mm1 = (0 > mm0) sign mask
        pcmpgtw mm4, mm3                        ; mm4 = (0 > mm3) sign mask
        pxor    mm0, mm1                        ; conditional negate, step 1
        pxor    mm3, mm4
        psubw   mm0, mm1                        ; step 2 -> mm0 = |mm0|
        psubw   mm3, mm4                        ;        -> mm3 = |mm3|
        psllw   mm0, 4                          ; |x| << 4
        psllw   mm3, 4
        movq    mm2, [inter_matrix + 8*ecx]
        psrlw   mm2, 1                          ; matrix >> 1 (rounding term)
        paddw   mm0, mm2
        movq    mm2, [inter_matrix_fix + ecx*8]
        pmulhw  mm0, mm2                        ; keep high 16 bits of product
        movq    mm2, [inter_matrix + 8*ecx + 8]
        psrlw   mm2, 1
        paddw   mm3, mm2
        movq    mm2, [inter_matrix_fix + ecx*8 + 8]
        pmulhw  mm3, mm2
%endmacro

; Back half of one iteration: add the unsigned levels to the running sum,
; put the original signs back, store eight results, advance ecx and branch
; back to the loop head given as %1 while ecx != 16.
%macro QUANT4_INTER_STORE 1
        paddw   mm5, mm0                        ; sum += level (still unsigned)
        pxor    mm0, mm1                        ; re-apply sign, step 1
        paddw   mm5, mm3
        pxor    mm3, mm4
        psubw   mm0, mm1                        ; re-apply sign, step 2
        psubw   mm3, mm4
        movq    [edi + 8*ecx], mm0
        movq    [edi + 8*ecx + 8], mm3
        add     ecx, 2
        cmp     ecx, 16
        jnz     near %1
%endmacro

align ALIGN
cglobal quant4_inter_mmx
quant4_inter_mmx:

        push    ecx
        push    esi
        push    edi

        mov     edi, [esp + 12 + 4]             ; coeff
        mov     esi, [esp + 12 + 8]             ; data
        mov     eax, [esp + 12 + 12]            ; quant

        xor     ecx, ecx                        ; index = 0
        pxor    mm5, mm5                        ; sum = 0

        cmp     al, 1                           ; quant == 1 -> shift-only path
        jz      near .q1loop

        cmp     al, 2                           ; quant == 2 -> shift-only path
        jz      near .q2loop

        movq    mm7, [mmx_div + eax * 8 - 8]    ; divider for this quant

; ---- generic path: multiply by the divider, then one extra shift ----------
align ALIGN
.loop:
        QUANT4_INTER_SCALE
        pmulhw  mm0, mm7                        ; (m * divider) >> 16
        pmulhw  mm3, mm7
        psrlw   mm0, 1                          ; one more bit: 16 + 1 = 17
        psrlw   mm3, 1
        QUANT4_INTER_STORE .loop

; ---- common exit: horizontal add of the four word sums --------------------
.done:
        pmaddwd mm5, [mmx_one]                  ; pairwise word -> dword sums
        movq    mm0, mm5
        psrlq   mm5, 32
        paddd   mm0, mm5                        ; low dword = total
        movd    eax, mm0                        ; return sum

        pop     edi
        pop     esi
        pop     ecx

        ret

; ---- quant == 1: level = m >> 1 --------------------------------------------
align ALIGN
.q1loop:
        QUANT4_INTER_SCALE
        psrlw   mm0, 1                          ; m / 2
        psrlw   mm3, 1
        QUANT4_INTER_STORE .q1loop
        jmp     .done

; ---- quant == 2: level = m >> 2 --------------------------------------------
align ALIGN
.q2loop:
        QUANT4_INTER_SCALE
        psrlw   mm0, 2                          ; m / 4
        psrlw   mm3, 2
        QUANT4_INTER_STORE .q2loop
        jmp     .done
;===========================================================================
;
; void dequant4_intra_mmx(int16_t *data,
;                         const int16_t * const coeff,
;                         const uint32_t quant,
;                         const uint32_t dcscalar);
;
; Dequantizes 64 int16 values from 'coeff' into 'data':
;     data[i] = sign(coeff[i]) * ((|coeff[i]| * q * intra_matrix[i]) >> 3)
; where q is the word vector loaded from [mmx_mul_quant + quant*8 - 8],
; then overwrites data[0] with coeff[0] * dcscalar.
;
; When SATURATE is defined, every result (including data[0]) is clamped;
; data[0] is clamped to [-2048, 2047].
;
; The DC product is formed in 32 bits so that the SATURATE range check sees
; the real value. A 16-bit multiply would wrap modulo 2^16 first, letting an
; overflowing product slip inside the range or clamp to the wrong bound.
; Without SATURATE the stored word is the low 16 bits of the product, which
; is identical to what a 16-bit multiply yields.
;
; Stack arguments (addressed relative to esp after two pushes):
;   [esp + 8 + 4]   data
;   [esp + 8 + 8]   coeff
;   [esp + 8 + 12]  quant
;   [esp + 8 + 16]  dcscalar (NOTE(review): assumed small enough that
;                   coeff[0] * dcscalar fits in 32 bits - confirm range)
;
; esi and edi are saved and restored; eax is clobbered. No emms is issued.
;
;===========================================================================

align 16
cglobal dequant4_intra_mmx
dequant4_intra_mmx:

        push    esi
        push    edi

        mov     edi, [esp + 8 + 4]              ; data
        mov     esi, [esp + 8 + 8]              ; coeff
        mov     eax, [esp + 8 + 12]             ; quant

        movq    mm7, [mmx_mul_quant + eax*8 - 8]

        xor     eax, eax                        ; eax = qword index, 0..15

align 16
.loop:
        movq    mm0, [esi + 8*eax]              ; mm0 = four coefficients

        pxor    mm1, mm1
        pcmpeqw mm1, mm0                        ; mm1 = (0 == mm0) zero mask

        pxor    mm2, mm2
        pcmpgtw mm2, mm0                        ; mm2 = (0 > mm0) sign mask
        pxor    mm0, mm2
        psubw   mm0, mm2                        ; mm0 = |mm0|

        pmullw  mm0, mm7                        ; mm0 *= quant (low 16 bits)

        movq    mm3, [intra_matrix + 8*eax]

        movq    mm4, mm0
        pmullw  mm0, mm3                        ; mm0 = low(mm0 * matrix)
        pmulhw  mm3, mm4                        ; mm3 = high(mm0 * matrix)

        movq    mm4, mm0                        ; interleave low/high words
        punpcklwd mm0, mm3                      ;   -> two 32-bit products
        punpckhwd mm4, mm3                      ;   -> two 32-bit products

        psrld   mm0, 3                          ; products /= 8
        psrld   mm4, 3

        packssdw mm0, mm4                       ; back to four signed words

        pxor    mm0, mm2                        ; restore the original sign
        psubw   mm0, mm2
        pandn   mm1, mm0                        ; mm1 = ~zero_mask & mm0

%ifdef SATURATE
        ; Saturating add/sub pairs clamp each word from above, then below.
        movq    mm2, [mmx_32767_minus_2047]
        movq    mm6, [mmx_32768_minus_2048]
        paddsw  mm1, mm2
        psubsw  mm1, mm2
        psubsw  mm1, mm6
        paddsw  mm1, mm6
%endif

        movq    [edi + 8*eax], mm1              ; store four results

        add     eax, 1
        cmp     eax, 16
        jnz     near .loop

        ; DC term: data[0] = coeff[0] * dcscalar, computed in 32 bits.
        movsx   eax, word [esi]                 ; eax = (int32) coeff[0]
        imul    eax, dword [esp + 8 + 16]       ; eax = coeff[0] * dcscalar
        mov     [edi], ax                       ; data[0] = low word

%ifdef SATURATE
        cmp     eax, -2048                      ; compare the full product
        jl      .set_n2048
        cmp     eax, 2047
        jg      .set_2047
%endif
        pop     edi
        pop     esi

        ret

%ifdef SATURATE
.set_n2048:
        mov     word [edi], -2048
        pop     edi
        pop     esi

        ret

.set_2047:
        mov     word [edi], 2047
        pop     edi
        pop     esi

        ret
%endif
;===========================================================================
;
; void dequant4_inter_mmx(int16_t * data,
;                         const int16_t * const coeff,
;                         const uint32_t quant);
;
; Dequantizes 64 int16 values from 'coeff' into 'data':
;     data[i] = 0                                           if coeff[i] == 0
;     data[i] = sign(coeff[i]) *
;               (((2*|coeff[i]| + 1) * q * inter_matrix[i]) >> 4)  otherwise
; where q is the word vector loaded from [mmx_mul_quant + quant*8 - 8].
; Every result is then clamped with saturating add/sub pairs; unlike
; dequant4_intra_mmx, the clamp here is unconditional (the SATURATE guard
; is commented out in the original source).
;
; Mismatch control: all stored words are XOR-ed together; if bit 0 of that
; XOR is clear, bit 0 of data[63] is toggled.
;
; Stack arguments (addressed relative to esp after two pushes):
;   [esp + 8 + 4]   data
;   [esp + 8 + 8]   coeff
;   [esp + 8 + 12]  quant
;
; esi and edi are saved and restored; eax is clobbered. No emms is issued.
;
;===========================================================================

align 16
cglobal dequant4_inter_mmx
dequant4_inter_mmx:

        push    esi
        push    edi

        mov     edi, [esp + 8 + 4]              ; data
        mov     esi, [esp + 8 + 8]              ; coeff
        mov     eax, [esp + 8 + 12]             ; quant

        movq    mm7, [mmx_mul_quant + eax*8 - 8]
        movq    mm6, [mmx_one]                  ; the "+1" term below

        xor     eax, eax                        ; eax = qword index, 0..15
        pxor    mm5, mm5                        ; mismatch accumulator = 0

align 16
.loop:
        movq    mm0, [esi + 8*eax]              ; four coefficients

        pxor    mm1, mm1
        pcmpeqw mm1, mm0                        ; mm1 = 0xFFFF where coeff == 0

        pxor    mm2, mm2
        pcmpgtw mm2, mm0                        ; mm2 = 0xFFFF where coeff < 0
        pxor    mm0, mm2
        psubw   mm0, mm2                        ; mm0 = |coeff|

        psllw   mm0, 1
        paddsw  mm0, mm6                        ; mm0 = 2*|coeff| + 1

        pmullw  mm0, mm7                        ; *= quant (low 16 bits kept)

        movq    mm3, [inter_matrix + 8*eax]

        movq    mm4, mm0
        pmullw  mm0, mm3                        ; low  words of mm0 * matrix
        pmulhw  mm3, mm4                        ; high words of mm0 * matrix

        movq    mm4, mm0                        ; interleave low/high words
        punpcklwd mm0, mm3                      ;   -> two 32-bit products
        punpckhwd mm4, mm3                      ;   -> two 32-bit products

        psrad   mm0, 4                          ; products /= 16
        psrad   mm4, 4

        packssdw mm0, mm4                       ; back to four signed words

        pxor    mm0, mm2                        ; put the sign back
        psubw   mm0, mm2
        pandn   mm1, mm0                        ; force 0 where coeff was 0

        ; Unconditional clamp (guard "%ifdef SATURATE" is disabled here).
        movq    mm2, [mmx_32767_minus_2047]
        movq    mm4, [mmx_32768_minus_2048]
        paddsw  mm1, mm2                        ; clamp from above ...
        psubsw  mm1, mm2
        psubsw  mm1, mm4                        ; ... then from below
        paddsw  mm1, mm4

        pxor    mm5, mm1                        ; fold into mismatch parity

        movq    [edi + 8*eax], mm1              ; store four results

        add     eax, 1
        cmp     eax, 16
        jnz     near .loop

        ; Mismatch control: reduce the four accumulated words to one by
        ; XOR-ing the upper half onto the lower half twice.
        movq    mm0, mm5
        psrlq   mm0, 32
        pxor    mm5, mm0                        ; words: w0^w2, w1^w3
        movq    mm0, mm5
        psrlq   mm0, 16
        pxor    mm5, mm0                        ; low word = w0^w1^w2^w3

        movd    eax, mm5
        test    eax, 0x1                        ; parity bit already set?
        jnz     .done

        xor     word [edi + 2*63], 1            ; no: flip LSB of data[63]

.done:
        pop     edi
        pop     esi

        ret
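; ---------------------------------------------------------------------------
; Web code-viewer residue (keyboard-shortcut help) - not part of the source:
;   Copy code           Ctrl + C
;   Search code         Ctrl + F
;   Full-screen mode    F11
;   Toggle theme        Ctrl + Shift + D
;   Show shortcuts      ?
;   Increase font size  Ctrl + =
;   Decrease font size  Ctrl + -
; ---------------------------------------------------------------------------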