📄 quantize4_mmx.asm
字号:
; NOTE(review): the instructions up to the first banner below are the tail of a
; routine whose beginning lies before this excerpt (it references intra_matrix
; and a local .q2loop label, so presumably the quant==2 path of the intra
; quantizer -- confirm against the full file). They are reproduced unchanged,
; one statement per line.
    pmulhw  mm0, mm2                    ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
    movq    mm2, [intra_matrix + 8*ecx + 8]
    psrlw   mm2, 1
    paddw   mm3, mm2
    movq    mm2, [intra_matrix_fix + ecx*8 + 8]
    pmulhw  mm3, mm2
    paddw   mm0, mm5
    paddw   mm3, mm5
    psrlw   mm0, 2                      ; mm0 >>= 2 (/4)
    psrlw   mm3, 2
    pxor    mm0, mm1                    ; mm0 *= sign(mm0)
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; undisplace
    psubw   mm3, mm4
    movq    [edi + 8*ecx], mm0
    movq    [edi + 8*ecx + 8], mm3

    add     ecx, 2
    cmp     ecx, 16
    jnz     near .q2loop
    jmp     near .done


;===========================================================================
;
; uint32_t quant4_inter_mmx(int16_t * coeff,
;                           const int16_t * const data,
;                           const uint32_t quant);
;
; Quantizes 64 signed 16-bit values from `data` into `coeff` using the
; per-coefficient tables inter_matrix / inter_matrix_fix.
;
; For every value:  level = (|data[i]|<<4 + inter_matrix[i]>>1) scaled by
; inter_matrix_fix[i] via pmulhw, then divided by 2*quant, and finally the
; original sign is restored.
;
; Three code paths share the same front end and differ only in the final
; divide by 2*quant:
;   .q1loop  quant == 1  -> logical shift right by 1
;   .q2loop  quant == 2  -> logical shift right by 2
;   .loop    otherwise   -> pmulhw by mmx_div[quant-1], then shift right by 1
;
; Returns (eax): sum of the quantized magnitudes (taken before the sign is
; restored). The running sum is kept in four 16-bit lanes (paddw) and only
; widened to 32 bits at .done.
;
; NOTE(review): inter_matrix_fix and mmx_div are defined outside this
; excerpt; they are presumably fixed-point reciprocal tables -- confirm.
; No emms is issued here; presumably the caller is responsible.
;
;===========================================================================

align ALIGN
cglobal quant4_inter_mmx
quant4_inter_mmx

    push    ecx
    push    esi
    push    edi

    mov     edi, [esp + 12 + 4]         ; coeff (destination)
    mov     esi, [esp + 12 + 8]         ; data  (source)
    mov     eax, [esp + 12 + 12]        ; quant

    xor     ecx, ecx                    ; ecx = qword index, 0..15

    pxor    mm5, mm5                    ; sum

    cmp     al, 1                       ; only the low byte of quant is tested
    jz      near .q1loop

    cmp     al, 2
    jz      near .q2loop

    movq    mm7, [mmx_div + eax * 8 - 8] ; divider, table entry for quant (index quant-1)

align ALIGN
.loop
    movq    mm0, [esi + 8*ecx]          ; mm0 = 4 words [1st]
    movq    mm3, [esi + 8*ecx + 8]      ; mm3 = next 4 words
    pxor    mm1, mm1                    ; mm1 = 0
    pxor    mm4, mm4
    pcmpgtw mm1, mm0                    ; mm1 = (0 > mm0)  -> sign mask
    pcmpgtw mm4, mm3
    pxor    mm0, mm1                    ; mm0 = |mm0|  (xor with mask ...
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; displace      ... then subtract mask)
    psubw   mm3, mm4

    psllw   mm0, 4                      ; level << 4
    psllw   mm3, 4

    movq    mm2, [inter_matrix + 8*ecx]
    psrlw   mm2, 1
    paddw   mm0, mm2                    ; + inter_matrix[i] >> 1 (rounding term)

    movq    mm2, [inter_matrix_fix + ecx*8]
    pmulhw  mm0, mm2                    ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]

    movq    mm2, [inter_matrix + 8*ecx + 8]
    psrlw   mm2, 1
    paddw   mm3, mm2

    movq    mm2, [inter_matrix_fix + ecx*8 + 8]
    pmulhw  mm3, mm2

    pmulhw  mm0, mm7                    ; mm0 = (mm0 / 2Q) >> 16
    pmulhw  mm3, mm7
    psrlw   mm0, 1                      ; additional shift by 1 => 16 + 1 = 17
    psrlw   mm3, 1

    paddw   mm5, mm0                    ; sum += mm0 (magnitude, before sign restore)
    pxor    mm0, mm1                    ; mm0 *= sign(mm0)
    paddw   mm5, mm3
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; undisplace
    psubw   mm3, mm4
    movq    [edi + 8*ecx], mm0
    movq    [edi + 8*ecx + 8], mm3

    add     ecx, 2
    cmp     ecx, 16
    jnz     near .loop

.done
    pmaddwd mm5, [mmx_one]              ; pairwise add the four word lanes into two dwords
    movq    mm0, mm5
    psrlq   mm5, 32
    paddd   mm0, mm5                    ; low dword = total
    movd    eax, mm0                    ; return sum

    pop     edi
    pop     esi
    pop     ecx

    ret


align ALIGN
.q1loop
    movq    mm0, [esi + 8*ecx]          ; mm0 = [1st]
    movq    mm3, [esi + 8*ecx+ 8]
    pxor    mm1, mm1                    ; mm1 = 0
    pxor    mm4, mm4
    pcmpgtw mm1, mm0                    ; mm1 = (0 > mm0)
    pcmpgtw mm4, mm3
    pxor    mm0, mm1                    ; mm0 = |mm0|
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; displace
    psubw   mm3, mm4

    psllw   mm0, 4
    psllw   mm3, 4

    movq    mm2, [inter_matrix + 8*ecx]
    psrlw   mm2, 1
    paddw   mm0, mm2

    movq    mm2, [inter_matrix_fix + ecx*8]
    pmulhw  mm0, mm2                    ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]

    movq    mm2, [inter_matrix + 8*ecx + 8]
    psrlw   mm2, 1
    paddw   mm3, mm2

    movq    mm2, [inter_matrix_fix + ecx*8 + 8]
    pmulhw  mm3, mm2

    psrlw   mm0, 1                      ; mm0 >>= 1 (/2)
    psrlw   mm3, 1

    paddw   mm5, mm0                    ; sum += mm0
    pxor    mm0, mm1                    ; mm0 *= sign(mm0)
    paddw   mm5, mm3
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; undisplace
    psubw   mm3, mm4

    movq    [edi + 8*ecx], mm0
    movq    [edi + 8*ecx + 8], mm3

    add     ecx, 2
    cmp     ecx, 16
    jnz     near .q1loop

    jmp     .done


align ALIGN
.q2loop
    movq    mm0, [esi + 8*ecx]          ; mm0 = [1st]
    movq    mm3, [esi + 8*ecx+ 8]
    pxor    mm1, mm1                    ; mm1 = 0
    pxor    mm4, mm4
    pcmpgtw mm1, mm0                    ; mm1 = (0 > mm0)
    pcmpgtw mm4, mm3
    pxor    mm0, mm1                    ; mm0 = |mm0|
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; displace
    psubw   mm3, mm4

    psllw   mm0, 4
    psllw   mm3, 4

    movq    mm2, [inter_matrix + 8*ecx]
    psrlw   mm2, 1
    paddw   mm0, mm2

    movq    mm2, [inter_matrix_fix + ecx*8]
    pmulhw  mm0, mm2                    ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]

    movq    mm2, [inter_matrix + 8*ecx + 8]
    psrlw   mm2, 1
    paddw   mm3, mm2

    movq    mm2, [inter_matrix_fix + ecx*8 + 8]
    pmulhw  mm3, mm2

    psrlw   mm0, 2                      ; mm0 >>= 2 (/4)
    psrlw   mm3, 2

    paddw   mm5, mm0                    ; sum += mm0
    pxor    mm0, mm1                    ; mm0 *= sign(mm0)
    paddw   mm5, mm3
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; undisplace
    psubw   mm3, mm4

    movq    [edi + 8*ecx], mm0
    movq    [edi + 8*ecx + 8], mm3

    add     ecx, 2
    cmp     ecx, 16
    jnz     near .q2loop

    jmp     .done


;===========================================================================
;
; void dequant4_intra_mmx(int16_t *data,
;                         const int16_t * const coeff,
;                         const uint32_t quant,
;                         const uint32_t dcscalar);
;
; Dequantizes 64 signed 16-bit values from `coeff` into `data`:
;   data[i] = sign(coeff[i]) * ((|coeff[i]| * quant * intra_matrix[i]) >> 3)
; with zero inputs forced to zero. Afterwards data[0] is overwritten with
; coeff[0] * dcscalar. When SATURATE is defined, results are clamped
; (presumably to [-2048, 2047], judging by the constant names -- the
; constants themselves are defined outside this excerpt).
;
; NOTE(review): |coeff| * quant is formed with pmullw, i.e. only the low
; 16 bits of that first product are kept.
;
;===========================================================================

align 16
cglobal dequant4_intra_mmx
dequant4_intra_mmx

    push    esi
    push    edi

    mov     edi, [esp + 8 + 4]          ; data  (destination)
    mov     esi, [esp + 8 + 8]          ; coeff (source)
    mov     eax, [esp + 8 + 12]         ; quant

    movq    mm7, [mmx_mul_quant + eax*8 - 8] ; table entry for quant (index quant-1)

    xor     eax, eax                    ; eax = qword index, 0..15

align 16
.loop
    movq    mm0, [esi + 8*eax]          ; mm0 = [coeff]

    pxor    mm1, mm1                    ; mm1 = 0
    pcmpeqw mm1, mm0                    ; mm1 = (0 == mm0)  -> zero mask

    pxor    mm2, mm2                    ; mm2 = 0
    pcmpgtw mm2, mm0                    ; mm2 = (0 > mm0)   -> sign mask
    pxor    mm0, mm2                    ; mm0 = |mm0|
    psubw   mm0, mm2                    ; displace

    pmullw  mm0, mm7                    ; mm0 *= quant

    movq    mm3, [intra_matrix + 8*eax]

    movq    mm4, mm0
    pmullw  mm0, mm3                    ; mm0 = low(mm0 * mm3)
    pmulhw  mm3, mm4                    ; mm3 = high(mm0 * mm3)

    movq    mm4, mm0                    ; mm0,mm4 = unpack(mm3, mm0) -> 32-bit products
    punpcklwd mm0, mm3
    punpckhwd mm4, mm3
    psrld   mm0, 3                      ; mm0,mm4 /= 8
    psrld   mm4, 3
    packssdw mm0, mm4                   ; mm0 = pack(mm4, mm0), signed saturation to 16 bit

    pxor    mm0, mm2                    ; mm0 *= sign(mm0)
    psubw   mm0, mm2                    ; undisplace
    pandn   mm1, mm0                    ; mm1 = ~(iszero) & mm0

%ifdef SATURATE
    ; add/sub with signed saturation clamps the upper bound, then sub/add
    ; clamps the lower bound
    movq    mm2, [mmx_32767_minus_2047]
    movq    mm6, [mmx_32768_minus_2048]
    paddsw  mm1, mm2
    psubsw  mm1, mm2
    psubsw  mm1, mm6
    paddsw  mm1, mm6
%endif

    movq    [edi + 8*eax], mm1          ; [data] = mm1

    add     eax, 1
    cmp     eax, 16
    jnz     near .loop

    ; DC coefficient: replaces the value stored for index 0 by the loop above.
    ; NOTE(review): imul on ax is a 16-bit multiply, so the product wraps
    ; before the SATURATE range check below -- confirm this matches the
    ; reference implementation before changing it.
    mov     ax, [esi]                   ; ax = coeff[0]
    imul    ax, [esp + 8 + 16]          ; ax = coeff[0] * dcscalar
    mov     [edi], ax                   ; data[0] = ax

%ifdef SATURATE
    cmp     ax, -2048
    jl      .set_n2048
    cmp     ax, 2047
    jg      .set_2047
%endif
    pop     edi
    pop     esi
    ret

%ifdef SATURATE
.set_n2048
    mov     word [edi], -2048
    pop     edi
    pop     esi
    ret

.set_2047
    mov     word [edi], 2047
    pop     edi
    pop     esi
    ret
%endif


;===========================================================================
;
; void dequant4_inter_mmx(int16_t * data,
;                         const int16_t * const coeff,
;                         const uint32_t quant);
;
; Dequantizes 64 signed 16-bit values from `coeff` into `data`:
;   data[i] = sign(coeff[i]) * (((2*|coeff[i]| + 1) * quant * inter_matrix[i]) >> 4)
; with zero inputs forced to zero, followed by a saturation step and by
; mismatch control: all outputs are XOR-ed together and, if bit 0 of the
; folded result is clear, bit 0 of data[63] is toggled.
;
; NOTE(review): the SATURATE conditional around the clamp is commented out
; in this routine, so the clamp is assumed to run unconditionally here
; (reconstructed from a line-collapsed source -- confirm against the
; original file).
;
;===========================================================================

align 16
cglobal dequant4_inter_mmx
dequant4_inter_mmx

    push    esi
    push    edi

    mov     edi, [esp + 8 + 4]          ; data  (destination)
    mov     esi, [esp + 8 + 8]          ; coeff (source)
    mov     eax, [esp + 8 + 12]         ; quant
    movq    mm7, [mmx_mul_quant + eax*8 - 8] ; table entry for quant (index quant-1)
    movq    mm6, [mmx_one]
    xor     eax, eax                    ; eax = qword index, 0..15
    pxor    mm5, mm5                    ; mismatch sum

align 16
.loop
    movq    mm0, [esi + 8*eax]          ; mm0 = [coeff]

    pxor    mm1, mm1                    ; mm1 = 0
    pcmpeqw mm1, mm0                    ; mm1 = (0 == mm0)  -> zero mask

    pxor    mm2, mm2                    ; mm2 = 0
    pcmpgtw mm2, mm0                    ; mm2 = (0 > mm0)   -> sign mask
    pxor    mm0, mm2                    ; mm0 = |mm0|
    psubw   mm0, mm2                    ; displace

    psllw   mm0, 1
    paddsw  mm0, mm6                    ; mm0 = 2*mm0 + 1
    pmullw  mm0, mm7                    ; mm0 *= quant

    movq    mm3, [inter_matrix + 8*eax]

    movq    mm4, mm0
    pmullw  mm0, mm3                    ; mm0 = low(mm0 * mm3)
    pmulhw  mm3, mm4                    ; mm3 = high(mm0 * mm3)

    movq    mm4, mm0                    ; mm0,mm4 = unpack(mm3, mm0) -> 32-bit products
    punpcklwd mm0, mm3
    punpckhwd mm4, mm3

    psrad   mm0, 4                      ; mm0,mm4 /= 16
    psrad   mm4, 4
    packssdw mm0, mm4                   ; mm0 = pack(mm4, mm0), signed saturation to 16 bit

    pxor    mm0, mm2                    ; mm0 *= sign(mm0)
    psubw   mm0, mm2                    ; undisplace
    pandn   mm1, mm0                    ; mm1 = ~(iszero) & mm0

;%ifdef SATURATE
    movq    mm2, [mmx_32767_minus_2047]
    movq    mm4, [mmx_32768_minus_2048]
    paddsw  mm1, mm2
    psubsw  mm1, mm2
    psubsw  mm1, mm4
    paddsw  mm1, mm4
;%endif

    pxor    mm5, mm1                    ; mismatch: accumulate XOR of all outputs

    movq    [edi + 8*eax], mm1          ; [data] = mm1

    add     eax, 1
    cmp     eax, 16
    jnz     near .loop

    ; mismatch control: fold the four word lanes of mm5 into the low word,
    ; so bit 0 of eax is the XOR of bit 0 of every output value
    movq    mm0, mm5
    movq    mm1, mm5
    movq    mm2, mm5
    psrlq   mm0, 48
    psrlq   mm1, 32
    psrlq   mm2, 16
    pxor    mm5, mm0
    pxor    mm5, mm1
    pxor    mm5, mm2

    movd    eax, mm5
    test    eax, 0x1
    jnz     .done                       ; parity already odd: nothing to do

    xor     word [edi + 2*63], 1        ; toggle bit 0 of data[63]

.done
    pop     edi
    pop     esi

    ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -