📄 skl_img_x86.asm
字号:
; (Tail of a routine whose beginning lies before this excerpt; the
;  instruction sequence is reproduced unchanged.)
        pop     ebx
        paddusw mm6, mm0
        paddusw mm6, mm1
        movd    eax, mm6
        ret

;----------------------------------------------------------------------
; Conventions shared by the routines below, as visible in this code:
;   * 32-bit x86, NASM syntax; arguments are read from the stack
;     ([esp+4], [esp+8], [esp+12]) and the result is returned in eax.
;   * Src/Src1/Src2 are byte pointers, BpS is the byte distance between
;     two consecutive rows.
;   * ebx is saved/restored wherever it is used as a scratch register.
;   * No 'emms' is executed by the MMX routines shown here.
;     NOTE(review): presumably the caller resets the MMX/x87 state --
;     confirm.
;   * SAD_8x8_SSE, COLLAPSE_MMX and COLLAPSE_4_MMX are macros defined
;     outside this excerpt.
;   * The hand-unrolled macro runs are written with %rep; the expansion
;     is the same instruction stream.
;----------------------------------------------------------------------

;----------------------------------------------------------------------
; Skl_SAD_8x16_SSE(Src1, Src2, BpS)
;   eax = Src1, edx = Src2, ecx = BpS, ebx = 2*BpS, mm6 = accumulator.
;   Seven SAD_8x8_SSE expansions followed by one explicit pair of
;   8-byte rows.
;   NOTE(review): SAD_8x8_SSE presumably accumulates two rows into mm6
;   and advances eax/edx by ebx (7*2 + 2 = 16 rows) -- confirm against
;   its definition.
;----------------------------------------------------------------------
align 16
Skl_SAD_8x16_SSE:
        mov     eax, [esp +  4]         ; Src1
        mov     edx, [esp +  8]         ; Src2
        mov     ecx, [esp + 12]         ; BpS
        push    ebx
        lea     ebx, [ecx + ecx]        ; row-pair stride
        pxor    mm5, mm5                ; original note: this is a NOP
        pxor    mm6, mm6                ; accum2

%rep 7
        SAD_8x8_SSE
%endrep

        ; last two rows, done inline
        movq    mm0, [eax]
        psadbw  mm0, [edx]
        movq    mm1, [eax + ecx]
        psadbw  mm1, [edx + ecx]
        pop     ebx
        paddusw mm6, mm0
        paddusw mm6, mm1
        movd    eax, mm6
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_SAD_4x4_SSE
;//////////////////////////////////////////////////////////////////////

; SAD_4x4_SSE: adds the SAD of two 4-byte rows to mm6.
;   Expects eax = Src1, edx = Src2, ecx = BpS, ebx = 2*BpS.
;   Advances eax and edx by ebx; uses mm0/mm1 as scratch.
%macro SAD_4x4_SSE 0
        movd    mm0, [eax]
        movd    mm1, [edx]
        psadbw  mm0, mm1
        movd    mm1, [eax + ecx]
        add     eax, ebx
        paddusw mm6, mm0
        movd    mm0, [edx + ecx]
        lea     edx, [edx + ebx]
        psadbw  mm1, mm0
        paddusw mm6, mm1
%endmacro

;----------------------------------------------------------------------
; Skl_SAD_4x4_SSE(Src1, Src2, BpS)
;   Sum of absolute byte differences over 4 rows of 4 bytes
;   (one macro expansion = 2 rows, plus 2 rows inline).
;----------------------------------------------------------------------
align 16
Skl_SAD_4x4_SSE:
        mov     eax, [esp +  4]         ; Src1
        mov     edx, [esp +  8]         ; Src2
        mov     ecx, [esp + 12]         ; BpS
        push    ebx
        lea     ebx, [ecx + ecx]        ; 2*BpS
        pxor    mm6, mm6                ; accum2
        pxor    mm0, mm0
        pxor    mm1, mm1

        SAD_4x4_SSE

        ; last two rows, done inline (no pointer advance needed)
        movd    mm0, [eax]
        movd    mm1, [edx]
        psadbw  mm0, mm1
        movd    mm1, [eax + ecx]
        pop     ebx
        paddusw mm6, mm0
        movd    mm0, [edx + ecx]
        psadbw  mm1, mm0
        paddusw mm6, mm1
        movd    eax, mm6
        ret

;----------------------------------------------------------------------
; Skl_SAD_4x8_SSE(Src1, Src2, BpS)
;   Same as Skl_SAD_4x4_SSE but over 8 rows
;   (three macro expansions = 6 rows, plus 2 rows inline).
;----------------------------------------------------------------------
align 16
Skl_SAD_4x8_SSE:
        mov     eax, [esp +  4]         ; Src1
        mov     edx, [esp +  8]         ; Src2
        mov     ecx, [esp + 12]         ; BpS
        push    ebx
        lea     ebx, [ecx + ecx]        ; 2*BpS
        pxor    mm6, mm6                ; accum2
        pxor    mm0, mm0
        pxor    mm1, mm1

%rep 3
        SAD_4x4_SSE
%endrep

        ; last two rows, done inline
        movd    mm0, [eax]
        movd    mm1, [edx]
        psadbw  mm0, mm1
        movd    mm1, [eax + ecx]
        pop     ebx
        paddusw mm6, mm0
        movd    mm0, [edx + ecx]
        psadbw  mm1, mm0
        paddusw mm6, mm1
        movd    eax, mm6
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_Mean_16x16_SSE
;//////////////////////////////////////////////////////////////////////

; MEAN_16x16_SSE: processes one 16-byte row at [eax].
;   psadbw against mm7: bytes 0..7 are accumulated into mm5 and
;   bytes 8..15 into mm6. With mm7 = 0 this is a plain byte sum; with
;   mm7 holding a replicated byte it is the sum of |x - that byte|.
;   Advances eax by ecx (one row).
%macro MEAN_16x16_SSE 0
        movq    mm0, [eax]
        psadbw  mm0, mm7
        movq    mm1, [eax + 8]
        psadbw  mm1, mm7
        paddusw mm5, mm0
        add     eax, ecx
        paddusw mm6, mm1
%endmacro

;----------------------------------------------------------------------
; Skl_Mean_16x16_SSE(Src, BpS)
;   Returns (sum of the 16x16 bytes) >> 8.
;   Original annotation: 97c.
;----------------------------------------------------------------------
align 16
Skl_Mean_16x16_SSE:
        mov     eax, [esp + 4]          ; Src
        mov     ecx, [esp + 8]          ; BpS
        pxor    mm5, mm5                ; accum (left half)
        pxor    mm6, mm6                ; accum (right half)
        pxor    mm7, mm7                ; zero

%rep 16
        MEAN_16x16_SSE
%endrep

        paddusw mm6, mm5                ; merge the two half sums
        movd    eax, mm6
        shr     eax, 8                  ; / 256
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_Mean_8x8_SSE
;//////////////////////////////////////////////////////////////////////

; MEAN_8x8_SSE: adds psadbw(row, mm7) of two 8-byte rows to mm6.
;   Expects eax = Src, ecx = BpS, edx = 2*BpS; advances eax by edx.
%macro MEAN_8x8_SSE 0
        movq    mm0, [eax]
        movq    mm1, [eax + ecx]
        psadbw  mm0, mm7
        add     eax, edx
        psadbw  mm1, mm7
        paddw   mm6, mm0
        paddw   mm6, mm1
%endmacro

;----------------------------------------------------------------------
; Skl_Mean_8x8_SSE(Src, BpS)
;   Accumulates 8 rows of 8 bytes into mm6, then COLLAPSE_MMX and
;   eax >>= 6.
;   NOTE(review): COLLAPSE_MMX is defined elsewhere; presumably it
;   leaves the accumulated sum in eax -- confirm.
;   Original annotation: 34c.
;----------------------------------------------------------------------
align 16
Skl_Mean_8x8_SSE:
        mov     eax, [esp + 4]          ; Src
        mov     ecx, [esp + 8]          ; BpS
        pxor    mm6, mm6                ; accum
        pxor    mm7, mm7                ; zero
        lea     edx, [ecx + ecx]        ; 2*BpS
        pxor    mm6, mm6                ; original note: this is a NOP

%rep 4
        MEAN_8x8_SSE
%endrep

        COLLAPSE_MMX
        shr     eax, 6                  ; / 64
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_Mean_4x4_SSE
;//////////////////////////////////////////////////////////////////////

; MEAN_4x4_SSE: adds psadbw(row, mm7) of two 4-byte rows to mm6.
;   Expects eax = Src, ecx = BpS, edx = 2*BpS; advances eax by edx.
%macro MEAN_4x4_SSE 0
        movd    mm0, [eax]
        movd    mm1, [eax + ecx]
        psadbw  mm0, mm7
        add     eax, edx
        psadbw  mm1, mm7
        paddw   mm6, mm0
        paddw   mm6, mm1
%endmacro

;----------------------------------------------------------------------
; Skl_Mean_4x4_SSE(Src, BpS)
;   Accumulates 4 rows of 4 bytes into mm6, then COLLAPSE_4_MMX and
;   eax >>= 4.
;   NOTE(review): COLLAPSE_4_MMX is defined elsewhere; presumably it
;   leaves the accumulated sum in eax -- confirm.
;----------------------------------------------------------------------
align 16
Skl_Mean_4x4_SSE:
        mov     eax, [esp + 4]          ; Src
        mov     ecx, [esp + 8]          ; BpS
        pxor    mm6, mm6                ; accum
        pxor    mm7, mm7                ; zero
        pxor    mm0, mm0
        pxor    mm1, mm1
        lea     edx, [ecx + ecx]        ; 2*BpS
        pxor    mm6, mm6                ; original note: this is a NOP

%rep 2
        MEAN_4x4_SSE
%endrep

        COLLAPSE_4_MMX
        shr     eax, 4                  ; / 16
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_Abs_Dev_16x16_SSE
;//////////////////////////////////////////////////////////////////////

;----------------------------------------------------------------------
; Skl_Abs_Dev_16x16_SSE(Src, BpS)
;   Pass 1: sum the 16x16 bytes, Mean = sum >> 8.
;   Pass 2: re-run MEAN_16x16_SSE with mm7 = Mean replicated in every
;           byte, which yields sum(|x - Mean|); that sum is returned.
;   Original annotation: 191c.
;----------------------------------------------------------------------
align 16
Skl_Abs_Dev_16x16_SSE:
        mov     eax, [esp + 4]          ; Src
        mov     ecx, [esp + 8]          ; BpS
        pxor    mm5, mm5                ; accum
        pxor    mm6, mm6                ; accum
        pxor    mm7, mm7                ; zero

%rep 16
        MEAN_16x16_SSE
%endrep

        mov     eax, [esp + 4]          ; Src (rewind for the second pass)
        paddusw mm5, mm6                ; total sum
        pxor    mm6, mm6                ; accum #1
        psrlw   mm5, 8                  ; => Mean
        pshufw  mm7, mm5, 0             ; replicate Mean (4 words)
        pxor    mm5, mm5                ; accum #2
        packuswb mm7, mm7               ; words -> 8 bytes of Mean

%rep 16
        MEAN_16x16_SSE
%endrep

        paddusw mm6, mm5
        movd    eax, mm6
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_Sqr_Dev_16x16_SSE
;//////////////////////////////////////////////////////////////////////

; SQR_DEV_16x16_SSE: processes one 16-byte row at [eax].
;   mm6 += byte sum of the row (psadbw against mm7, which must be 0)
;   mm5 += squares of the row's bytes, kept as two dword partial sums
;   Advances eax by ecx; uses mm0..mm3 as scratch.
%macro SQR_DEV_16x16_SSE 0
        movq    mm0, [eax]
        movq    mm2, mm0
        movq    mm1, [eax + 8]
        movq    mm3, mm1
        psadbw  mm0, mm7
        lea     eax, [eax + ecx]
        psadbw  mm1, mm7
        paddw   mm6, mm0
        movq    mm0, mm2
        paddw   mm6, mm1
        movq    mm1, mm3
        punpcklbw mm0, mm7              ; bytes -> words
        punpcklbw mm1, mm7
        punpckhbw mm2, mm7
        punpckhbw mm3, mm7
        pmaddwd mm0, mm0                ; pairwise x*x + y*y
        pmaddwd mm1, mm1
        pmaddwd mm2, mm2
        pmaddwd mm3, mm3
        paddd   mm5, mm0
        paddd   mm5, mm1
        paddd   mm5, mm2
        paddd   mm5, mm3
%endmacro

;----------------------------------------------------------------------
; Skl_Sqr_Dev_16x16_SSE(Src, BpS)
;   Returns (sum(x*x) >> 8) - (sum(x) >> 8)^2 over the 16x16 bytes.
;   Original annotation: 237c.
;----------------------------------------------------------------------
align 16
Skl_Sqr_Dev_16x16_SSE:
        mov     eax, [esp + 4]          ; Src
        mov     ecx, [esp + 8]          ; BpS
        pxor    mm5, mm5                ; accum for sqr
        pxor    mm6, mm6                ; accum for mean
        pxor    mm7, mm7                ; zero

%rep 16
        SQR_DEV_16x16_SSE
%endrep

        ; we can't use a *signed* 'pmulhw mm6,mm6' here => pmaddw instead
        psrlq   mm6, 8                  ; sum >> 8
        pmaddwd mm6, mm6                ; squared
        movq    mm7, mm5
        psrlq   mm5, 32
        paddd   mm5, mm7                ; fold the two dword partial sums
        psrld   mm5, 8                  ; (Sqr)>>8
        psubd   mm5, mm6
        movd    eax, mm5
        ret

;//////////////////////////////////////////////////////////////////////
;//
;// SSE2 impl
;//
;//////////////////////////////////////////////////////////////////////

;//////////////////////////////////////////////////////////////////////
; Skl_SAD_16x16_SSE2
;//////////////////////////////////////////////////////////////////////

; SAD_16x16_SSE2: adds the SAD of two 16-byte rows to xmm6.
;   [edx] is read with unaligned loads, [eax] with aligned loads
;   (movdqa). Advances eax and edx by 2*ecx.
;   psadbw leaves one partial sum per 64-bit half of the register.
%macro SAD_16x16_SSE2 0
        movdqu  xmm0, [edx]
        movdqu  xmm1, [edx + ecx]
        lea     edx, [edx + 2*ecx]
        movdqa  xmm2, [eax]
        movdqa  xmm3, [eax + ecx]
        lea     eax, [eax + 2*ecx]
        psadbw  xmm0, xmm2
        paddusw xmm6, xmm0
        psadbw  xmm1, xmm3
        paddusw xmm6, xmm1
%endmacro

;----------------------------------------------------------------------
; Skl_SAD_16x16_SSE2(Src1, Src2, BpS)
;   SAD over 16 rows of 16 bytes (8 expansions x 2 rows).
;   Src1 rows are loaded with movdqa.
;----------------------------------------------------------------------
align 16
Skl_SAD_16x16_SSE2:
        mov     eax, [esp +  4]         ; Src1 (assumed aligned)
        mov     edx, [esp +  8]         ; Src2
        mov     ecx, [esp + 12]         ; BpS
        pxor    xmm6, xmm6              ; accum

%rep 8
        SAD_16x16_SSE2
%endrep

        pshufd  xmm5, xmm6, 00000010b   ; dword 0 <- upper-half partial sum
        paddusw xmm6, xmm5              ; low word = lower + upper
        pextrw  eax, xmm6, 0
        ret

;----------------------------------------------------------------------
; Skl_SAD_16x8_Field_SSE2(Src1, Src2, BpS)
;   Same kernel as Skl_SAD_16x16_SSE2 but with the row stride doubled
;   (ecx = 2*BpS) and 4 expansions, i.e. 8 rows taken from every other
;   line.
;----------------------------------------------------------------------
align 16
Skl_SAD_16x8_Field_SSE2:
        mov     eax, [esp +  4]         ; Src1 (assumed aligned)
        mov     edx, [esp +  8]         ; Src2
        mov     ecx, [esp + 12]         ; BpS
        lea     ecx, [ecx + ecx]        ; 2.BpS
        pxor    xmm6, xmm6              ; accum

%rep 4
        SAD_16x16_SSE2
%endrep

        pshufd  xmm5, xmm6, 00000010b   ; dword 0 <- upper-half partial sum
        paddusw xmm6, xmm5
        pextrw  eax, xmm6, 0
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_SAD_16x7_Self_SSE2
;//////////////////////////////////////////////////////////////////////

; SAD_16x7_SSE2: adds SAD([eax],[edx]) and SAD([eax+ecx],[edx+ecx]) to
;   xmm6, then advances eax and edx by 2*ecx. All four memory operands
;   are accessed with instructions that require 16-byte alignment
;   (movdqa / psadbw with a memory source).
%macro SAD_16x7_SSE2 0
        movdqa  xmm0, [eax]
        psadbw  xmm0, [edx]
        movdqa  xmm1, [eax + ecx]
        lea     eax, [eax + 2*ecx]
        psadbw  xmm1, [edx + ecx]
        paddusw xmm6, xmm0
        lea     edx, [edx + 2*ecx]
        paddusw xmm6, xmm1
%endmacro

;----------------------------------------------------------------------
; Skl_SAD_16x7_Self_SSE2(Src, BpS)
;   edx is set to Src + BpS, so each SAD compares a 16-byte row with
;   the row directly below it. 3 expansions x 2 + 1 inline = 7 row
;   pairs.
;----------------------------------------------------------------------
align 16
Skl_SAD_16x7_Self_SSE2:
        mov     eax, [esp + 4]          ; Src (assumed aligned)
        mov     ecx, [esp + 8]          ; BpS
        lea     edx, [eax + ecx]        ; next row
        pxor    xmm7, xmm7              ; original note: this is a NOP
        pxor    xmm6, xmm6              ; accum2

%rep 3
        SAD_16x7_SSE2
%endrep

        ; seventh row pair, done inline
        movdqa  xmm0, [eax]
        psadbw  xmm0, [edx]
        paddusw xmm6, xmm0

        pshufd  xmm5, xmm6, 00000010b   ; dword 0 <- upper-half partial sum
        paddusw xmm6, xmm5
        pextrw  eax, xmm6, 0
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_Mean_16x16_SSE2
;//////////////////////////////////////////////////////////////////////

; MEAN_16x16_SSE2: adds psadbw(row, xmm7) of two 16-byte rows to xmm6
;   (unaligned loads). Advances eax by 2*ecx.
%macro MEAN_16x16_SSE2 0
        movdqu  xmm0, [eax]
        movdqu  xmm1, [eax + ecx]
        lea     eax, [eax + 2*ecx]      ; + 2*BpS
        psadbw  xmm0, xmm7
        paddusw xmm6, xmm0
        psadbw  xmm1, xmm7
        paddusw xmm6, xmm1
%endmacro

;----------------------------------------------------------------------
; Skl_Mean_16x16_SSE2(Src, BpS)
;   Returns (sum of the 16x16 bytes) >> 8.
;----------------------------------------------------------------------
align 16
Skl_Mean_16x16_SSE2:
        mov     eax, [esp + 4]          ; Src
        mov     ecx, [esp + 8]          ; BpS
        pxor    xmm6, xmm6              ; accum
        pxor    xmm7, xmm7              ; zero

%rep 8
        MEAN_16x16_SSE2
%endrep

        pshufd  xmm5, xmm6, 10b         ; dword 0 <- upper-half partial sum
        paddusw xmm6, xmm5
        pextrw  eax, xmm6, 0
        shr     eax, 8                  ; / 256
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_Abs_Dev_16x16_SSE2
;//////////////////////////////////////////////////////////////////////

;----------------------------------------------------------------------
; Skl_Abs_Dev_16x16_SSE2(Src, BpS)
;   Pass 1: sum the 16x16 bytes, Mean = sum >> 8.
;   Pass 2: re-run MEAN_16x16_SSE2 with xmm7 = Mean replicated in all
;           16 bytes, which yields sum(|x - Mean|); that sum is
;           returned.
;----------------------------------------------------------------------
align 16
Skl_Abs_Dev_16x16_SSE2:
        mov     eax, [esp + 4]          ; Src
        mov     ecx, [esp + 8]          ; BpS
        pxor    xmm6, xmm6              ; accum
        pxor    xmm7, xmm7              ; zero

%rep 8
        MEAN_16x16_SSE2
%endrep

        mov     eax, [esp + 4]          ; Src (rewind for the second pass)
        pshufd  xmm7, xmm6, 10b         ; dword 0 <- upper-half partial sum
        paddusw xmm7, xmm6              ; word 0 = total sum
        pxor    xmm6, xmm6              ; zero accum
        psrlw   xmm7, 8                 ; => Mean
        pshuflw xmm7, xmm7, 0           ; replicate Mean (low 4 words)
        packuswb xmm7, xmm7             ; low 4 bytes = Mean
        pshufd  xmm7, xmm7, 00000000b   ; broadcast dword 0 -> 16 bytes of Mean

%rep 8
        MEAN_16x16_SSE2
%endrep

        pshufd  xmm7, xmm6, 10b
        paddusw xmm7, xmm6
        pextrw  eax, xmm7, 0
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_Sqr_16x16_SSE2
;//////////////////////////////////////////////////////////////////////

; SQR_16x16_SSE2: adds the squares of the bytes of two 16-byte rows to
;   the four dword lanes of xmm7. xmm6 must be zero (used to widen
;   bytes to words). Advances eax by 2*ecx; uses xmm0..xmm3 as scratch.
%macro SQR_16x16_SSE2 0
        movdqu  xmm0, [eax]
        movdqu  xmm1, [eax + ecx]
        lea     eax, [eax + 2*ecx]
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1
        punpcklbw xmm0, xmm6            ; bytes -> words
        punpcklbw xmm1, xmm6
        punpckhbw xmm2, xmm6
        punpckhbw xmm3, xmm6
        pmaddwd xmm0, xmm0              ; pairwise x*x + y*y
        pmaddwd xmm1, xmm1
        pmaddwd xmm2, xmm2
        pmaddwd xmm3, xmm3
        paddd   xmm7, xmm0
        paddd   xmm7, xmm1
        paddd   xmm7, xmm2
        paddd   xmm7, xmm3
%endmacro

;----------------------------------------------------------------------
; Skl_Sqr_16x16_SSE2(Src, BpS)
;   Returns (sum of x*x over the 16x16 bytes) >> 8.
;   Original annotation: 287c.
;----------------------------------------------------------------------
align 16
Skl_Sqr_16x16_SSE2:
        mov     eax, [esp + 4]          ; Src
        mov     ecx, [esp + 8]          ; BpS
        pxor    xmm7, xmm7              ; accum
        pxor    xmm6, xmm6              ; zero

%rep 8
        SQR_16x16_SSE2
%endrep

        pshufd  xmm6, xmm7, 1110b       ; bring dwords 2,3 down to 0,1
        paddd   xmm7, xmm6
        pshufd  xmm6, xmm7, 01b         ; bring dword 1 down to 0
        paddd   xmm7, xmm6              ; dword 0 = full sum
        movd    eax, xmm7
        shr     eax, 8
        ret

;//////////////////////////////////////////////////////////////////////
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -