📄 skl_mb_mmx.asm
字号:
ret                       ; end of a routine whose start lies before this excerpt

;//////////////////////////////////////////////////////////////////////
;// Conventions shared by the routines below, as far as this excerpt
;// shows them:
;//   eax = Src pointer (the 16-wide HH code reloads it from [esp+8])
;//   ecx = Dst pointer (the 16-wide HH code reloads it from [esp+4])
;//   edx = byte step added to eax and to ecx to reach the next row
;//   mm7 = rounder in the HH code; cleared by 'pxor' in the 8b->16b code
;//   mm6 = second operand of the byte->word unpacks in the HH code
;//         (presumably zero, set up by PROLOG -- TODO confirm)
;// PROLOG, PROLOG0, MIX2, COPY_FH_MMX and the Rounder?_MMX symbols are
;// defined outside this excerpt.
;//
;// Each routine handles its rows with a %rep block: N-1 times
;// "row macro + advance Dst", then one last row macro without advance.
;//////////////////////////////////////////////////////////////////////

; 8 rows of COPY_FH_MMX, prologue argument Rounder0_MMX.
align 16
Skl_Copy_8x8_FH_Rnd1_MMX:
  PROLOG Rounder0_MMX, 0
%rep 7
  COPY_FH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_FH_MMX
  ret

; One 16-byte row: bytes [eax..] and [eax+1..] are handed to MIX2 in two
; 8-byte halves, results are stored at [ecx] and [ecx+8].
; Advances eax by edx (before the second MIX2); ecx is left untouched.
; Uses mm0-mm3, plus whatever MIX2 uses.
%macro COPY_16x8_FH_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+1]
  movq mm1, mm0
  movq mm3, mm2
  MIX2
  movq [ecx], mm0
  movq mm0, [eax+8]
  movq mm2, [eax+9]
  movq mm1, mm0
  movq mm3, mm2
  lea eax,[eax+edx]
  MIX2
  movq [ecx+8], mm0
%endmacro

; 8 rows of COPY_16x8_FH_MMX, prologue argument Rounder1_MMX.
align 16
Skl_Copy_16x8_FH_Rnd0_MMX:
  PROLOG Rounder1_MMX, 0
%rep 7
  COPY_16x8_FH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_16x8_FH_MMX
  ret

; 8 rows of COPY_16x8_FH_MMX, prologue argument Rounder0_MMX.
align 16
Skl_Copy_16x8_FH_Rnd1_MMX:
  PROLOG Rounder0_MMX, 0
%rep 7
  COPY_16x8_FH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_16x8_FH_MMX
  ret

; 16 rows of COPY_16x8_FH_MMX, prologue argument Rounder1_MMX.
; Both labels name the same entry point.
align 16
Skl_H_Pass_2Taps_MMX:
Skl_Copy_16x16_FH_Rnd0_MMX:
  PROLOG Rounder1_MMX, 0
%rep 15
  COPY_16x8_FH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_16x8_FH_MMX
  ret

;//////////////////////////////////////////////////////////////////////

; One 8-byte row: bytes at [eax] and at [eax+edx] (the next row) are
; handed to MIX2, the result is stored at [ecx].
; Advances eax by edx; ecx is left untouched.
; Uses mm0-mm3, plus whatever MIX2 uses.
%macro COPY_HF_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+edx]
  movq mm1, mm0
  movq mm3, mm2
  lea eax,[eax+edx]
  MIX2
  movq [ecx], mm0
%endmacro

; 4 rows of COPY_HF_MMX, prologue argument Rounder1_MMX.
align 16
Skl_Copy_8x4_HF_Rnd0_MMX:
  PROLOG Rounder1_MMX, 0
%rep 3
  COPY_HF_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HF_MMX
  ret

; 4 rows of COPY_HF_MMX, prologue argument Rounder0_MMX.
align 16
Skl_Copy_8x4_HF_Rnd1_MMX:
  PROLOG Rounder0_MMX, 0
%rep 3
  COPY_HF_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HF_MMX
  ret

; 8 rows of COPY_HF_MMX, prologue argument Rounder1_MMX.
align 16
Skl_Copy_8x8_HF_Rnd0_MMX:
  PROLOG Rounder1_MMX, 0
%rep 7
  COPY_HF_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HF_MMX
  ret

; 8 rows of COPY_HF_MMX, prologue argument Rounder0_MMX.
align 16
Skl_Copy_8x8_HF_Rnd1_MMX:
  PROLOG Rounder0_MMX, 0
%rep 7
  COPY_HF_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HF_MMX
  ret

; One 16-byte row: same as COPY_HF_MMX but done for both 8-byte halves,
; results stored at [ecx] and [ecx+8].
; Advances eax by edx (before the second MIX2); ecx is left untouched.
%macro COPY_16x8_HF_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+edx]
  movq mm1, mm0
  movq mm3, mm2
  MIX2
  movq [ecx], mm0
  movq mm0, [eax+8]
  movq mm2, [eax+edx+8]
  movq mm1, mm0
  movq mm3, mm2
  lea eax,[eax+edx]
  MIX2
  movq [ecx+8], mm0
%endmacro

; 8 rows of COPY_16x8_HF_MMX, prologue argument Rounder1_MMX.
align 16
Skl_Copy_16x8_HF_Rnd0_MMX:
  PROLOG Rounder1_MMX, 0
%rep 7
  COPY_16x8_HF_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_16x8_HF_MMX
  ret

; 8 rows of COPY_16x8_HF_MMX, prologue argument Rounder0_MMX.
align 16
Skl_Copy_16x8_HF_Rnd1_MMX:
  PROLOG Rounder0_MMX, 0
%rep 7
  COPY_16x8_HF_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_16x8_HF_MMX
  ret

; 16 rows of COPY_16x8_HF_MMX, prologue argument Rounder1_MMX.
; Both labels name the same entry point.
align 16
Skl_V_Pass_2Taps_MMX:
Skl_Copy_16x16_HF_Rnd0_MMX:
  PROLOG Rounder1_MMX, 0
%rep 15
  COPY_16x8_HF_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_16x8_HF_MMX
  ret

;//////////////////////////////////////////////////////////////////////

; One 8-byte output row of the HH (two-row, two-column) combination.
; In:  mm2/mm3 = word sums Src[x]+Src[x+1] of the previous row
;                (low four / high four pixels), mm7 = rounder,
;                mm6 = unpack partner.
; Out: [ecx] = packed bytes of (prev sums + new sums + rounder) >> 2;
;      mm2/mm3 = word sums of the row just loaded, ready for the next
;      invocation; eax advanced by edx. ecx is left untouched.
; Uses mm0-mm5.
%macro COPY_HH_MMX 0
  lea eax,[eax+edx]

    ; transfer prev line to mm0/mm1
  movq mm0, mm2
  movq mm1, mm3

    ; load new line in mm2/mm3
  movq mm2, [eax]
  movq mm4, [eax+1]
  movq mm3, mm2
  movq mm5, mm4

  punpcklbw mm2, mm6
  paddusw mm0, mm7        ; rounder
  punpcklbw mm4, mm6
  paddusw mm1, mm7        ; rounder
  punpckhbw mm3, mm6
  paddusw mm2, mm4
  punpckhbw mm5, mm6
  paddusw mm0, mm2
  paddusw mm3, mm5
  psrlw mm0, 2
  paddusw mm1, mm3
  psrlw mm1, 2
  packuswb mm0, mm1
  movq [ecx], mm0
%endmacro

; Preprocess the first line of a column for COPY_HH_MMX.
; Out: mm2/mm3 contain the value (Src[0]+Src[1]) of the line at [eax]
;      as words (low four / high four pixels).
; Uses mm0-mm3; eax and ecx are left untouched.
%macro COPY_HH_FIRST_LINE_MMX 0
  movq mm0, [eax]
  movq mm1, mm0
  movq mm2, [eax+1]
  movq mm3, mm2
  punpcklbw mm0, mm6
  punpcklbw mm2, mm6
  paddusw mm2, mm0
  punpckhbw mm1, mm6
  punpckhbw mm3, mm6
  paddusw mm3, mm1
%endmacro

; 4 rows of COPY_HH_MMX, prologue argument Rounder2_MMX.
align 16
Skl_Copy_8x4_HH_Rnd0_MMX:
  PROLOG Rounder2_MMX, 0
  COPY_HH_FIRST_LINE_MMX
%rep 3
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX
  ret

; 4 rows of COPY_HH_MMX, prologue argument Rounder1_MMX.
align 16
Skl_Copy_8x4_HH_Rnd1_MMX:
  PROLOG Rounder1_MMX, 0
  COPY_HH_FIRST_LINE_MMX
%rep 3
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX
  ret

; 8 rows of COPY_HH_MMX, prologue argument Rounder2_MMX.
align 16
Skl_Copy_8x8_HH_Rnd0_MMX:
  PROLOG Rounder2_MMX, 0
  COPY_HH_FIRST_LINE_MMX
%rep 7
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX
  ret

; 8 rows of COPY_HH_MMX, prologue argument Rounder1_MMX.
align 16
Skl_Copy_8x8_HH_Rnd1_MMX:
  PROLOG Rounder1_MMX, 0
  COPY_HH_FIRST_LINE_MMX
%rep 7
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX
  ret

; Two 8-byte columns of 8 rows each, prologue argument Rounder2_MMX.
; The second column re-reads Dst/Src from the stack, which relies on esp
; still having its entry value (presumably PROLOG pushes nothing --
; TODO confirm).
align 16
Skl_Copy_16x8_HH_Rnd0_MMX:
  PROLOG Rounder2_MMX, 0
  COPY_HH_FIRST_LINE_MMX
%rep 7
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX

    ; second column
  mov ecx, [esp+ 4]       ; Dst
  mov eax, [esp+ 8]       ; Src
  lea ecx, [ecx+8]
  lea eax, [eax+8]
  COPY_HH_FIRST_LINE_MMX
%rep 7
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX
  ret

; Two 8-byte columns of 8 rows each, prologue argument Rounder1_MMX.
; Same stack assumption as Skl_Copy_16x8_HH_Rnd0_MMX for the second column.
align 16
Skl_Copy_16x8_HH_Rnd1_MMX:
  PROLOG Rounder1_MMX, 0
  COPY_HH_FIRST_LINE_MMX
%rep 7
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX

    ; second column
  mov ecx, [esp+ 4]       ; Dst
  mov eax, [esp+ 8]       ; Src
  lea ecx, [ecx+8]
  lea eax, [eax+8]
  COPY_HH_FIRST_LINE_MMX
%rep 7
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX
  ret

; Two 8-byte columns of 16 rows each, prologue argument Rounder2_MMX.
; Both labels name the same entry point. The second column re-reads
; Dst/Src from [esp+4]/[esp+8] (esp presumably unchanged since entry --
; TODO confirm against PROLOG).
align 16
Skl_HV_Pass_2Taps_MMX:
Skl_Copy_16x16_HH_Rnd0_MMX:
  PROLOG Rounder2_MMX, 0
  COPY_HH_FIRST_LINE_MMX
%rep 15
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX

    ; second column
  mov ecx, [esp+ 4]       ; Dst
  mov eax, [esp+ 8]       ; Src
  lea ecx, [ecx+8]
  lea eax, [eax+8]
  COPY_HH_FIRST_LINE_MMX
%rep 15
  COPY_HH_MMX
  lea ecx,[ecx+edx]
%endrep
  COPY_HH_MMX
  ret

;//////////////////////////////////////////////////////////////////////
;//
;// 8b to 16b transfer ops
;//
;//////////////////////////////////////////////////////////////////////

; Widen two consecutive source rows of 8 bytes to 16-bit words.
;   %1 = index of the 32-byte destination slot (stores go to
;        [ecx+%1*32] .. [ecx+%1*32+31])
;   %2 = byte offset added to eax for the loads
; The row at [eax+%2] fills the first 16 bytes of the slot, the row at
; [eax+%2+edx] fills the last 16. Bytes are interleaved with mm7.
; Uses mm0-mm3; eax and ecx are left untouched.
%macro UPLOAD 2
  movq mm0, [eax+%2]
  movq mm2, mm0
  movq mm1, [eax+%2+edx]
  movq mm3, mm1
  punpcklbw mm0, mm7
  punpckhbw mm2, mm7
  movq [ecx+%1*32], mm0
  movq [ecx+%1*32+8], mm2
  punpcklbw mm1, mm7
  punpckhbw mm3, mm7
  movq [ecx+%1*32+16], mm1
  movq [ecx+%1*32+24], mm3
%endmacro

; 8 source rows of 16 bytes -> words. Bytes 0..7 of each row pair go to
; slots 0..3, bytes 8..15 go to slots 4..7.
; NOTE(review): in the collapsed source the label was followed by a bare
; ';' and then PROLOG0; PROLOG0 is taken to be live code here, as in
; Skl_Copy_8x8_8To16_MMX -- confirm against the pristine file.
align 16
Skl_Copy_16x8_8To16_MMX:  ;
  PROLOG0
  pxor mm7, mm7           ; mm7 = 0 for the unpacks in UPLOAD
  UPLOAD 0, 0
  UPLOAD 4, 8
  lea eax,[eax+2*edx]
  UPLOAD 1, 0
  UPLOAD 5, 8
  lea eax,[eax+2*edx]
  UPLOAD 2, 0
  UPLOAD 6, 8
  lea eax,[eax+2*edx]
  UPLOAD 3, 0
  UPLOAD 7, 8
  ret

; 8 source rows of 8 bytes -> words, written to slots 0..3.
align 16
Skl_Copy_8x8_8To16_MMX:  ; 31c
  PROLOG0
  pxor mm7, mm7           ; mm7 = 0 for the unpacks in UPLOAD
  UPLOAD 0, 0
  lea eax,[eax+2*edx]
  UPLOAD 1, 0
  lea eax,[eax+2*edx]
  UPLOAD 2, 0
  lea eax,[eax+2*edx]
  UPLOAD 3, 0
  ret

;//////////////////////////////////////////////////////////////////////
;//
;// Diffs (8b->16b)
;//
;//////////////////////////////////////////////////////////////////////

; NOTE: this macro continues past the end of the visible excerpt; only
; its first instructions are shown here and no %endmacro is supplied.
%macro DIFF 2
  movq mm0, [eax+%2]      ; Src
  movq mm1, mm0
  movq mm2, [eax+%2+edx]
  movq mm3, mm2
  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -