📄 quantize_h263_mmx.asm
字号:
; ---------------------------------------------------------------------------
; NOTE(review): the instructions from here to the first ".endfunc" are the
; tail of a routine whose entry point lies before the visible part of this
; file.  They are kept instruction-for-instruction; "plus_one" and
; ".qes2_done" are defined outside this view.
; ---------------------------------------------------------------------------
        movdqu  xmm6, [plus_one]
        pmaddwd xmm5, xmm6
        movhlps xmm6, xmm5
        paddd   xmm5, xmm6
        movdq2q mm0, xmm5

        movq    mm5, mm0
        psrlq   mm5, 32
        paddd   mm0, mm5

        movd    eax, mm0                ; return sum

        pop     edi
        pop     esi

        ret

ALIGN 16
.qes2_q1loop
        movdqa  xmm0, [esi + ecx*8]             ; xmm0 = [1st]
        movdqa  xmm3, [esi + ecx*8 + 16]        ; xmm3 = [2nd]
        pxor    xmm1, xmm1
        pxor    xmm4, xmm4
        pcmpgtw xmm1, xmm0
        pcmpgtw xmm4, xmm3
        pxor    xmm0, xmm1
        pxor    xmm3, xmm4
        psubw   xmm0, xmm1
        psubw   xmm3, xmm4
        psubusw xmm0, xmm6
        psubusw xmm3, xmm6
        psrlw   xmm0, 1
        psrlw   xmm3, 1
        paddw   xmm5, xmm0
        pxor    xmm0, xmm1
        paddw   xmm5, xmm3
        pxor    xmm3, xmm4
        psubw   xmm0, xmm1
        psubw   xmm3, xmm4
        movdqa  [edi + ecx*8], xmm0
        movdqa  [edi + ecx*8 + 16], xmm3

        add     ecx, 4
        cmp     ecx, 16
        jnz     .qes2_q1loop
        jmp     .qes2_done
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_mmx(int16_t *data,
;                                 const int16_t * const coeff,
;                                 const uint32_t quant,
;                                 const uint32_t dcscalar,
;                                 const uint16_t *mpeg_matrices);
;
; For each of 64 16-bit coefficients c (8 iterations x 16 bytes):
;   c == 0 : data = 0
;   c != 0 : data = clamp(2*quant*c + sign(c)*q, -2048, 2047)
;            where q = quant if quant is odd, quant-1 if quant is even.
; The products/sums are 16-bit wrapping (pmullw/paddw) before the clamp.
; After the loop data[0] is overwritten with the clamped low 16 bits of
; coeff[0] * (low word of dcscalar).
;
; In:    [esp+ 4] = data, [esp+ 8] = coeff, [esp+12] = quant,
;        [esp+16] = dcscalar.  The fifth argument (mpeg_matrices) is
;        never read.
; Out:   eax = 0
; Clobb: eax, ecx, edx, mm0-mm7, flags.  No EMMS is executed here.
; Note:  mmx_quant is defined outside this view; presumably entry [quant]
;        holds quant replicated into four words -- confirm against the
;        data section.
;
;-----------------------------------------------------------------------------

ALIGN 16
dequant_h263_intra_mmx:

        mov     ecx, [esp+12]                   ; quant
        mov     eax, [esp+ 8]                   ; coeff
        pcmpeqw mm0, mm0                        ; mm0 = all ones
        movq    mm6, [mmx_quant + ecx*8]        ; quant
        shl     ecx, 31                         ; quant & 1 ? 0 : - 1
        movq    mm7, mm6
        movq    mm5, mm0
        movd    mm1, ecx                        ; shift count: 0 (even) or 2^31 (odd)
        mov     edx, [esp+ 4]                   ; data
        psllw   mm0, mm1                        ; count > 15 clears mm0 -> 0 if odd, -1 if even
        paddw   mm7, mm7                        ; 2*quant
        paddw   mm6, mm0                        ; quant-1 (even quant only)
        psllw   mm5, 12                         ; 0xF000
        mov     ecx, 8                          ; 8 iterations of 8 coefficients
        psrlw   mm5, 1                          ; 0x7800 = 32767-2047, saturation bias

.loop:
        movq    mm0, [eax]
        pxor    mm2, mm2
        pxor    mm4, mm4
        pcmpgtw mm2, mm0                        ; mm2 = sign mask (c < 0 ? -1 : 0)
        pcmpeqw mm4, mm0                        ; mm4 = zero mask (c == 0 ? -1 : 0)
        pmullw  mm0, mm7                        ; * 2 * quant
        movq    mm1, [eax+8]
        psubw   mm0, mm2                        ; +1 for negative c (pairs with ~q below)
        pxor    mm2, mm6                        ; q, or ~q = -q-1 for negative c
        pxor    mm3, mm3
        pandn   mm4, mm2                        ; drop the offset where c == 0
        pxor    mm2, mm2
        pcmpgtw mm3, mm1
        pcmpeqw mm2, mm1
        pmullw  mm1, mm7
        paddw   mm0, mm4
        psubw   mm1, mm3
        pxor    mm3, mm6
        pandn   mm2, mm3
        paddsw  mm0, mm5                        ; saturate
        paddw   mm1, mm2

        paddsw  mm1, mm5                        ; +bias / -bias  -> min(x,  2047)
        psubsw  mm0, mm5
        psubsw  mm1, mm5
        psubsw  mm0, mm5                        ; -bias / +bias  -> max(x, -2048)
        psubsw  mm1, mm5
        paddsw  mm0, mm5
        paddsw  mm1, mm5

        movq    [edx], mm0
        lea     eax, [eax+16]
        movq    [edx+8], mm1

        dec     ecx
        lea     edx, [edx+16]                   ; lea leaves the flags from dec intact
        jne     .loop

        ; deal with DC
        mov     eax, [esp+ 8]                   ; coeff
        movd    mm1, [esp+16]                   ; dcscalar
        movd    mm0, [eax]                      ; coeff[0] (32-bit load, low word used)
        pmullw  mm0, mm1                        ; * dcscalar
        mov     edx, [esp+ 4]                   ; data
        paddsw  mm0, mm5                        ; saturate +
        psubsw  mm0, mm5
        psubsw  mm0, mm5                        ; saturate -
        paddsw  mm0, mm5
        movd    eax, mm0
        mov     [edx], ax                       ; data[0] = clamped DC

        xor     eax, eax                        ; return 0
        ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_xmm(int16_t *data,
;                                 const int16_t * const coeff,
;                                 const uint32_t quant,
;                                 const uint32_t dcscalar,
;                                 const uint16_t *mpeg_matrices);
;
; Same per-coefficient result as dequant_h263_intra_mmx, but quant is
; broadcast with pshufw instead of a table load and the clamp to
; [-2048, 2047] uses pminsw/pmaxsw.
;
; In:    [esp+ 4] = data, [esp+ 8] = coeff, [esp+12] = quant,
;        [esp+16] = dcscalar.  The fifth argument (mpeg_matrices) is
;        never read.
; Out:   eax = 0
; Clobb: eax, ecx, edx, mm0-mm7, flags.  No EMMS is executed here.
;
;-----------------------------------------------------------------------------

ALIGN 16
dequant_h263_intra_xmm:

        mov     ecx, [esp+12]                   ; quant
        mov     eax, [esp+ 8]                   ; coeff

        movd    mm6, ecx                        ; quant
        pcmpeqw mm0, mm0                        ; mm0 = all ones
        pshufw  mm6, mm6, 0                     ; all quant
        shl     ecx, 31
        movq    mm5, mm0
        movq    mm7, mm6
        movd    mm1, ecx
        mov     edx, [esp+ 4]                   ; data
        psllw   mm0, mm1                        ; quant & 1 ? 0 : - 1
        movq    mm4, mm5
        paddw   mm7, mm7                        ; quant*2
        paddw   mm6, mm0                        ; quant-1 (even quant only)
        psrlw   mm4, 5                          ; mm4=2047
        mov     ecx, 8                          ; 8 iterations of 8 coefficients
        pxor    mm5, mm4                        ; mm5=-2048

.loop:
        movq    mm0, [eax]
        pxor    mm2, mm2
        pxor    mm3, mm3

        pcmpgtw mm2, mm0                        ; sign mask
        pcmpeqw mm3, mm0                        ; if coeff==0...
        pmullw  mm0, mm7                        ; * 2 * quant
        movq    mm1, [eax+8]

        psubw   mm0, mm2
        pxor    mm2, mm6
        pandn   mm3, mm2                        ; ...then data=0
        pxor    mm2, mm2
        paddw   mm0, mm3
        pxor    mm3, mm3
        pcmpeqw mm2, mm1
        pcmpgtw mm3, mm1
        pmullw  mm1, mm7

        pminsw  mm0, mm4                        ; upper clamp
        psubw   mm1, mm3
        pxor    mm3, mm6
        pandn   mm2, mm3
        paddw   mm1, mm2

        pmaxsw  mm0, mm5                        ; lower clamp
        pminsw  mm1, mm4
        movq    [edx], mm0
        pmaxsw  mm1, mm5
        lea     eax, [eax+16]
        movq    [edx+8], mm1

        dec     ecx
        lea     edx, [edx+16]                   ; lea leaves the flags from dec intact
        jne     .loop

        ; deal with DC
        mov     eax, [esp+ 8]                   ; coeff
        movd    mm1, [esp+16]                   ; dcscalar
        movd    mm0, [eax]                      ; coeff[0] (32-bit load, low word used)
        pmullw  mm0, mm1                        ; * dcscalar (16-bit product)
        mov     edx, [esp+ 4]                   ; data
        pminsw  mm0, mm4
        pmaxsw  mm0, mm5
        movd    eax, mm0
        mov     [edx], ax                       ; data[0] = clamped DC

        xor     eax, eax                        ; return 0
        ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_sse2(int16_t *data,
;                                  const int16_t * const coeff,
;                                  const uint32_t quant,
;                                  const uint32_t dcscalar,
;                                  const uint16_t *mpeg_matrices);
;
; Same per-coefficient result as the MMX variants, processing 16
; coefficients per iteration (4 iterations x 32 bytes).  The DC term is
; computed with a scalar imul; the low word of the product is clamped and
; stored to data[0].
;
; In:    [esp+ 4] = data, [esp+ 8] = coeff, [esp+12] = quant,
;        [esp+16] = dcscalar.  The fifth argument (mpeg_matrices) is
;        never read.  coeff and data are accessed with movdqa, which
;        faults on addresses that are not 16-byte aligned.
; Out:   eax = 0
; Clobb: eax, ecx, edx, xmm0-xmm7, flags
;
;-----------------------------------------------------------------------------

ALIGN 16
dequant_h263_intra_sse2:

        mov     ecx, [esp+12]                   ; quant
        mov     eax, [esp+ 8]                   ; coeff

        movd    xmm6, ecx                       ; quant

        shl     ecx, 31
        pshuflw xmm6, xmm6, 0
        pcmpeqw xmm0, xmm0                      ; xmm0 = all ones
        movlhps xmm6, xmm6                      ; all quant
        movd    xmm1, ecx
        movdqa  xmm5, xmm0
        movdqa  xmm7, xmm6
        mov     edx, [esp+ 4]                   ; data
        paddw   xmm7, xmm7                      ; quant *2
        psllw   xmm0, xmm1                      ; quant & 1 ? 0 : - 1
        movdqa  xmm4, xmm5
        paddw   xmm6, xmm0                      ; quant-1 (even quant only)
        psrlw   xmm4, 5                         ; 2047
        mov     ecx, 4                          ; 4 iterations of 16 coefficients
        pxor    xmm5, xmm4                      ; xmm5 = -2048

.loop:
        movdqa  xmm0, [eax]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3

        pcmpgtw xmm2, xmm0                      ; sign mask
        pcmpeqw xmm3, xmm0                      ; zero mask
        pmullw  xmm0, xmm7                      ; * 2 * quant
        movdqa  xmm1, [eax+16]

        psubw   xmm0, xmm2
        pxor    xmm2, xmm6
        pandn   xmm3, xmm2                      ; drop the offset where c == 0
        pxor    xmm2, xmm2
        paddw   xmm0, xmm3
        pxor    xmm3, xmm3
        pcmpeqw xmm2, xmm1
        pcmpgtw xmm3, xmm1
        pmullw  xmm1, xmm7

        pminsw  xmm0, xmm4                      ; upper clamp
        psubw   xmm1, xmm3
        pxor    xmm3, xmm6
        pandn   xmm2, xmm3
        paddw   xmm1, xmm2

        pmaxsw  xmm0, xmm5                      ; lower clamp
        pminsw  xmm1, xmm4
        movdqa  [edx], xmm0
        pmaxsw  xmm1, xmm5
        lea     eax, [eax+32]
        movdqa  [edx+16], xmm1

        dec     ecx
        lea     edx, [edx+32]                   ; lea leaves the flags from dec intact
        jne     .loop

        ; deal with DC
        mov     eax, [esp+ 8]                   ; coeff
        movsx   eax, word [eax]                 ; sign-extended coeff[0]
        imul    dword [esp+16]                  ; dcscalar (edx:eax = product, edx reloaded next)
        mov     edx, [esp+ 4]                   ; data
        movd    xmm0, eax
        pminsw  xmm0, xmm4                      ; word-wise clamp; only the low word is stored
        pmaxsw  xmm0, xmm5
        movd    eax, xmm0

        mov     [edx], ax                       ; data[0] = clamped DC

        xor     eax, eax                        ; return 0
        ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_mmx(int16_t * data,
;                                 const int16_t * const coeff,
;                                 const uint32_t quant,
;                                 const uint16_t *mpeg_matrices);
;
; For each of 64 16-bit coefficients c (8 iterations x 16 bytes):
;   c == 0 : data = 0
;   c != 0 : data = clamp(2*quant*c + sign(c)*q, -2048, 2047)
;            where q = quant if quant is odd, quant-1 if quant is even.
; No DC special case.  Arithmetic is 16-bit wrapping before the clamp.
;
; In:    [esp+ 4] = data, [esp+ 8] = coeff, [esp+12] = quant.
;        The fourth argument (mpeg_matrices) is never read.
; Out:   eax = 0
; Clobb: eax, ecx, edx, mm0-mm7, flags.  No EMMS is executed here.
; Note:  mmx_quant is defined outside this view; presumably entry [quant]
;        holds quant replicated into four words -- confirm against the
;        data section.
;
;-----------------------------------------------------------------------------

ALIGN 16
dequant_h263_inter_mmx:

        mov     ecx, [esp+12]                   ; quant
        mov     eax, [esp+ 8]                   ; coeff
        pcmpeqw mm0, mm0                        ; mm0 = all ones
        movq    mm6, [mmx_quant + ecx*8]        ; quant
        shl     ecx, 31                         ; odd/even
        movq    mm7, mm6
        movd    mm1, ecx
        mov     edx, [esp+ 4]                   ; data
        movq    mm5, mm0
        psllw   mm0, mm1                        ; quant & 1 ? 0 : - 1
        paddw   mm7, mm7                        ; quant*2
        paddw   mm6, mm0                        ; quant & 1 ? quant : quant - 1
        psllw   mm5, 12
        mov     ecx, 8                          ; 8 iterations of 8 coefficients
        psrlw   mm5, 1                          ; 32767-2047 (32768-2048)

.loop:
        movq    mm0, [eax]
        pxor    mm4, mm4
        pxor    mm2, mm2
        pcmpeqw mm4, mm0                        ; if coeff==0...
        pcmpgtw mm2, mm0                        ; sign mask
        pmullw  mm0, mm7                        ; * 2 * quant
        pxor    mm3, mm3
        psubw   mm0, mm2
        movq    mm1, [eax+8]
        pxor    mm2, mm6
        pcmpgtw mm3, mm1                        ; sign mask of second quad (before the multiply)
        pandn   mm4, mm2                        ; ... then data==0
        pmullw  mm1, mm7
        pxor    mm2, mm2
        pcmpeqw mm2, mm1                        ; NOTE: zero test here is on the 16-bit product
        psubw   mm1, mm3
        pxor    mm3, mm6
        pandn   mm2, mm3
        paddw   mm0, mm4
        paddw   mm1, mm2

        paddsw  mm0, mm5                        ; saturate
        paddsw  mm1, mm5
        psubsw  mm0, mm5
        psubsw  mm1, mm5
        psubsw  mm0, mm5
        psubsw  mm1, mm5
        paddsw  mm0, mm5
        paddsw  mm1, mm5

        movq    [edx], mm0
        lea     eax, [eax+16]
        movq    [edx+8], mm1

        dec     ecx
        lea     edx, [edx+16]                   ; lea leaves the flags from dec intact
        jne     .loop

        xor     eax, eax                        ; return 0
        ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_xmm(int16_t * data,
;                                 const int16_t * const coeff,
;                                 const uint32_t quant,
;                                 const uint16_t *mpeg_matrices);
;
; Same per-coefficient result as dequant_h263_inter_mmx; the clamp to
; [-2048, 2047] uses pminsw/pmaxsw.
;
; In:    [esp+ 4] = data, [esp+ 8] = coeff, [esp+12] = quant.
;        The fourth argument (mpeg_matrices) is never read.
; Out:   eax = 0
; Clobb: eax, ecx, edx, mm0-mm7, flags.  No EMMS is executed here.
; Note:  mmx_quant is defined outside this view; presumably entry [quant]
;        holds quant replicated into four words -- confirm against the
;        data section.
;
;-----------------------------------------------------------------------------

ALIGN 16
dequant_h263_inter_xmm:

        mov     ecx, [esp+12]                   ; quant
        mov     eax, [esp+ 8]                   ; coeff
        pcmpeqw mm0, mm0                        ; mm0 = all ones
        movq    mm6, [mmx_quant + ecx*8]        ; quant
        shl     ecx, 31
        movq    mm5, mm0
        movd    mm1, ecx
        movq    mm7, mm6
        psllw   mm0, mm1                        ; 0 if quant is odd, -1 if even
        mov     edx, [esp+ 4]                   ; data
        movq    mm4, mm5
        paddw   mm7, mm7                        ; 2*quant
        paddw   mm6, mm0                        ; quant-1 (even quant only)

        psrlw   mm4, 5                          ; mm4 = 2047
        mov     ecx, 8                          ; 8 iterations of 8 coefficients
        pxor    mm5, mm4                        ; mm5=-2048

.loop:
        movq    mm0, [eax]
        pxor    mm3, mm3
        pxor    mm2, mm2
        pcmpeqw mm3, mm0                        ; zero mask
        pcmpgtw mm2, mm0                        ; sign mask
        pmullw  mm0, mm7                        ; * 2 * quant
        pandn   mm3, mm6                        ; q where c != 0, else 0
        movq    mm1, [eax+8]
        psubw   mm0, mm2                        ; +1 for negative c
        pxor    mm2, mm3                        ; q, or ~q = -q-1 for negative c
        pxor    mm3, mm3
        paddw   mm0, mm2
        pxor    mm2, mm2
        pcmpgtw mm3, mm1
        pcmpeqw mm2, mm1
        pmullw  mm1, mm7
        pandn   mm2, mm6
        psubw   mm1, mm3
        pxor    mm3, mm2
        paddw   mm1, mm3

        pminsw  mm0, mm4
        pminsw  mm1, mm4
        pmaxsw  mm0, mm5
        pmaxsw  mm1, mm5

        movq    [edx], mm0
        lea     eax, [eax+16]
        movq    [edx+8], mm1

        dec     ecx
        lea     edx, [edx+16]                   ; lea leaves the flags from dec intact
        jne     .loop

        xor     eax, eax                        ; return 0
        ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_sse2(int16_t * data,
;                                  const int16_t * const coeff,
;                                  const uint32_t quant,
;                                  const uint16_t *mpeg_matrices);
;
; Same per-coefficient result as the MMX inter variants, processing 16
; coefficients per iteration (4 iterations x 32 bytes).  The even-quant
; adjustment is built as ((quant+1) & 1) broadcast to all words and
; subtracted from quant.
;
; In:    [esp+ 4] = data, [esp+ 8] = coeff, [esp+12] = quant.
;        The fourth argument (mpeg_matrices) is never read.  coeff and
;        data are accessed with movdqa, which faults on addresses that
;        are not 16-byte aligned.
; Out:   eax = 0
; Clobb: eax, ecx, edx, xmm0-xmm7, flags
; Note:  mmx_quant is defined outside this view; presumably entry [quant]
;        holds quant replicated into four words -- confirm against the
;        data section.
;
;-----------------------------------------------------------------------------

ALIGN 16
dequant_h263_inter_sse2:

        mov     ecx, [esp+12]                   ; quant
        mov     eax, [esp+ 8]                   ; coeff

        movq    xmm6, [mmx_quant + ecx*8]       ; quant
        inc     ecx
        pcmpeqw xmm5, xmm5                      ; xmm5 = all ones
        and     ecx, 1                          ; 1 if quant is even, 0 if odd
        movlhps xmm6, xmm6                      ; copy low quadword to the high half
        movd    xmm0, ecx
        movdqa  xmm7, xmm6
        pshuflw xmm0, xmm0, 0
        movdqa  xmm4, xmm5
        mov     edx, [esp+ 4]                   ; data
        movlhps xmm0, xmm0
        paddw   xmm7, xmm7                      ; 2*quant
        psubw   xmm6, xmm0                      ; quant-1 (even quant only)
        psrlw   xmm4, 5                         ; 2047
        mov     ecx, 4                          ; 4 iterations of 16 coefficients
        pxor    xmm5, xmm4                      ; xmm5 = -2048

.loop:
        movdqa  xmm0, [eax]
        pxor    xmm3, xmm3
        pxor    xmm2, xmm2
        pcmpeqw xmm3, xmm0                      ; zero mask
        pcmpgtw xmm2, xmm0                      ; sign mask
        pmullw  xmm0, xmm7                      ; * 2 * quant
        pandn   xmm3, xmm6                      ; q where c != 0, else 0
        movdqa  xmm1, [eax+16]
        psubw   xmm0, xmm2                      ; +1 for negative c
        pxor    xmm2, xmm3                      ; q, or ~q = -q-1 for negative c
        pxor    xmm3, xmm3
        paddw   xmm0, xmm2
        pxor    xmm2, xmm2
        pcmpgtw xmm3, xmm1
        pcmpeqw xmm2, xmm1
        pmullw  xmm1, xmm7
        pandn   xmm2, xmm6
        psubw   xmm1, xmm3
        pxor    xmm3, xmm2
        paddw   xmm1, xmm3

        pminsw  xmm0, xmm4
        pminsw  xmm1, xmm4
        pmaxsw  xmm0, xmm5
        pmaxsw  xmm1, xmm5

        movdqa  [edx], xmm0
        lea     eax, [eax+32]
        movdqa  [edx+16], xmm1

        dec     ecx
        lea     edx, [edx+32]                   ; lea leaves the flags from dec intact
        jne     .loop

        xor     eax, eax                        ; return 0
        ret
.endfunc
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -