📄 sad-a.asm
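;*****************************************************************************
;* sad-a.asm: SIMD sum-of-absolute-differences (SAD) kernels from x264's
;* pixel assembly. The excerpt below starts partway through the INTRA_SAD16
;* macro body and continues through the multi-candidate SAD x3/x4 kernels
;* for MMX and SSE2.
;*****************************************************************************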
    pxor    m4, m4
    pxor    m3, m3
    pxor    m2, m2
    mov     r3d, 15*FENC_STRIDE
.vloop:
    SPLATB  m6, r1+r3*2-1, m1
    mova    m0, [r0+r3]
    psadbw  m0, m7
    paddw   m4, m0
    mova    m0, [r0+r3]
    psadbw  m0, m5
    paddw   m2, m0
%if mmsize==8
    mova    m0, [r0+r3]
    psadbw  m0, m6
    paddw   m3, m0
    mova    m0, [r0+r3+8]
    psadbw  m0, m7
    paddw   m4, m0
    mova    m0, [r0+r3+8]
    psadbw  m0, m1
    paddw   m2, m0
    psadbw  m6, [r0+r3+8]
    paddw   m3, m6
%else
    psadbw  m6, [r0+r3]
    paddw   m3, m6
%endif
    add     r3d, -FENC_STRIDE
    jge     .vloop
%if mmsize==16
    pslldq  m3, 4
    por     m3, m2
    movhlps m1, m3
    paddw   m3, m1
    movq    [r2+0], m3
    movhlps m1, m4
    paddw   m4, m1
%else
    movd    [r2+0], m2
    movd    [r2+4], m3
%endif
    movd    [r2+8], m4
    RET
%endmacro

INIT_MMX
%define SPLATB SPLATB_MMX
INTRA_SAD16 mmxext
INIT_XMM
INTRA_SAD16 sse2, 8
%define SPLATB SPLATB_SSSE3
INTRA_SAD16 ssse3, 8

;=============================================================================
; SAD x3/x4 MMX
;=============================================================================

%macro SAD_X3_START_1x8P 0
    movq    mm3, [r0]
    movq    mm0, [r1]
    movq    mm1, [r2]
    movq    mm2, [r3]
    psadbw  mm0, mm3
    psadbw  mm1, mm3
    psadbw  mm2, mm3
%endmacro

%macro SAD_X3_1x8P 2
    movq    mm3, [r0+%1]
    movq    mm4, [r1+%2]
    movq    mm5, [r2+%2]
    movq    mm6, [r3+%2]
    psadbw  mm4, mm3
    psadbw  mm5, mm3
    psadbw  mm6, mm3
    paddw   mm0, mm4
    paddw   mm1, mm5
    paddw   mm2, mm6
%endmacro

%macro SAD_X3_START_2x4P 3
    movd    mm3, [r0]
    movd    %1, [r1]
    movd    %2, [r2]
    movd    %3, [r3]
    punpckldq mm3, [r0+FENC_STRIDE]
    punpckldq %1, [r1+r4]
    punpckldq %2, [r2+r4]
    punpckldq %3, [r3+r4]
    psadbw  %1, mm3
    psadbw  %2, mm3
    psadbw  %3, mm3
%endmacro

%macro SAD_X3_2x16P 1
%if %1
    SAD_X3_START_1x8P
%else
    SAD_X3_1x8P 0, 0
%endif
    SAD_X3_1x8P 8, 8
    SAD_X3_1x8P FENC_STRIDE, r4
    SAD_X3_1x8P FENC_STRIDE+8, r4+8
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r4]
    lea     r2, [r2+2*r4]
    lea     r3, [r3+2*r4]
%endmacro

%macro SAD_X3_2x8P 1
%if %1
    SAD_X3_START_1x8P
%else
    SAD_X3_1x8P 0, 0
%endif
    SAD_X3_1x8P FENC_STRIDE, r4
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r4]
    lea     r2, [r2+2*r4]
    lea     r3, [r3+2*r4]
%endmacro

%macro SAD_X3_2x4P 1
%if %1
    SAD_X3_START_2x4P mm0, mm1, mm2
%else
    SAD_X3_START_2x4P mm4, mm5, mm6
    paddw   mm0, mm4
    paddw   mm1, mm5
    paddw   mm2, mm6
%endif
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r4]
    lea     r2, [r2+2*r4]
    lea     r3, [r3+2*r4]
%endmacro

%macro SAD_X4_START_1x8P 0
    movq    mm7, [r0]
    movq    mm0, [r1]
    movq    mm1, [r2]
    movq    mm2, [r3]
    movq    mm3, [r4]
    psadbw  mm0, mm7
    psadbw  mm1, mm7
    psadbw  mm2, mm7
    psadbw  mm3, mm7
%endmacro

%macro SAD_X4_1x8P 2
    movq    mm7, [r0+%1]
    movq    mm4, [r1+%2]
    movq    mm5, [r2+%2]
    movq    mm6, [r3+%2]
    psadbw  mm4, mm7
    psadbw  mm5, mm7
    psadbw  mm6, mm7
    psadbw  mm7, [r4+%2]
    paddw   mm0, mm4
    paddw   mm1, mm5
    paddw   mm2, mm6
    paddw   mm3, mm7
%endmacro

%macro SAD_X4_START_2x4P 0
    movd    mm7, [r0]
    movd    mm0, [r1]
    movd    mm1, [r2]
    movd    mm2, [r3]
    movd    mm3, [r4]
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm0, [r1+r5]
    punpckldq mm1, [r2+r5]
    punpckldq mm2, [r3+r5]
    punpckldq mm3, [r4+r5]
    psadbw  mm0, mm7
    psadbw  mm1, mm7
    psadbw  mm2, mm7
    psadbw  mm3, mm7
%endmacro

%macro SAD_X4_INC_2x4P 0
    movd    mm7, [r0]
    movd    mm4, [r1]
    movd    mm5, [r2]
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm4, [r1+r5]
    punpckldq mm5, [r2+r5]
    psadbw  mm4, mm7
    psadbw  mm5, mm7
    paddw   mm0, mm4
    paddw   mm1, mm5
    movd    mm4, [r3]
    movd    mm5, [r4]
    punpckldq mm4, [r3+r5]
    punpckldq mm5, [r4+r5]
    psadbw  mm4, mm7
    psadbw  mm5, mm7
    paddw   mm2, mm4
    paddw   mm3, mm5
%endmacro

%macro SAD_X4_2x16P 1
%if %1
    SAD_X4_START_1x8P
%else
    SAD_X4_1x8P 0, 0
%endif
    SAD_X4_1x8P 8, 8
    SAD_X4_1x8P FENC_STRIDE, r5
    SAD_X4_1x8P FENC_STRIDE+8, r5+8
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r5]
    lea     r2, [r2+2*r5]
    lea     r3, [r3+2*r5]
    lea     r4, [r4+2*r5]
%endmacro
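;-----------------------------------------------------------------------------
; Note on the accumulation strategy used by all of these macros: psadbw sums
; the absolute byte differences of its two operands into one 16-bit partial
; sum per 64-bit lane, so each 8-pixel run of |fenc - ref| costs a single
; instruction, and the per-candidate totals are built up with plain paddw.
; Even the 16x16 worst case, 256 pixels * 255 = 65280, still fits in a
; 16-bit lane, so the paddw accumulators cannot overflow before the *_END
; macros store the final 32-bit scores.
;-----------------------------------------------------------------------------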
%macro SAD_X4_2x8P 1
%if %1
    SAD_X4_START_1x8P
%else
    SAD_X4_1x8P 0, 0
%endif
    SAD_X4_1x8P FENC_STRIDE, r5
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r5]
    lea     r2, [r2+2*r5]
    lea     r3, [r3+2*r5]
    lea     r4, [r4+2*r5]
%endmacro

%macro SAD_X4_2x4P 1
%if %1
    SAD_X4_START_2x4P
%else
    SAD_X4_INC_2x4P
%endif
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r5]
    lea     r2, [r2+2*r5]
    lea     r3, [r3+2*r5]
    lea     r4, [r4+2*r5]
%endmacro

%macro SAD_X3_END 0
%ifdef UNIX64
    movd    [r5+0], mm0
    movd    [r5+4], mm1
    movd    [r5+8], mm2
%else
    mov     r0, r5mp
    movd    [r0+0], mm0
    movd    [r0+4], mm1
    movd    [r0+8], mm2
%endif
    RET
%endmacro

%macro SAD_X4_END 0
    mov     r0, r6mp
    movd    [r0+0], mm0
    movd    [r0+4], mm1
    movd    [r0+8], mm2
    movd    [r0+12], mm3
    RET
%endmacro

;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                      uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
%ifdef WIN64
    %assign i %1+1
    movsxd r %+ i, r %+ i %+ d
%endif
    SAD_X%1_2x%2P 1
%rep %3/2-1
    SAD_X%1_2x%2P 0
%endrep
    SAD_X%1_END
%endmacro

SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4

;=============================================================================
; SAD x3/x4 XMM
;=============================================================================

%macro SAD_X3_START_1x16P_SSE2 0
    movdqa  xmm3, [r0]
    movdqu  xmm0, [r1]
    movdqu  xmm1, [r2]
    movdqu  xmm2, [r3]
    psadbw  xmm0, xmm3
    psadbw  xmm1, xmm3
    psadbw  xmm2, xmm3
%endmacro

%macro SAD_X3_1x16P_SSE2 2
    movdqa  xmm3, [r0+%1]
    movdqu  xmm4, [r1+%2]
    movdqu  xmm5, [r2+%2]
    movdqu  xmm6, [r3+%2]
    psadbw  xmm4, xmm3
    psadbw  xmm5, xmm3
    psadbw  xmm6, xmm3
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
%endmacro

%macro SAD_X3_2x16P_SSE2 1
%if %1
    SAD_X3_START_1x16P_SSE2
%else
    SAD_X3_1x16P_SSE2 0, 0
%endif
    SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r4]
    lea     r2, [r2+2*r4]
    lea     r3, [r3+2*r4]
%endmacro

%macro SAD_X3_START_2x8P_SSE2 0
    movq    xmm7, [r0]
    movq    xmm0, [r1]
    movq    xmm1, [r2]
    movq    xmm2, [r3]
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm0, [r1+r4]
    movhps  xmm1, [r2+r4]
    movhps  xmm2, [r3+r4]
    psadbw  xmm0, xmm7
    psadbw  xmm1, xmm7
    psadbw  xmm2, xmm7
%endmacro

%macro SAD_X3_2x8P_SSE2 0
    movq    xmm7, [r0]
    movq    xmm3, [r1]
    movq    xmm4, [r2]
    movq    xmm5, [r3]
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm3, [r1+r4]
    movhps  xmm4, [r2+r4]
    movhps  xmm5, [r3+r4]
    psadbw  xmm3, xmm7
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    paddw   xmm0, xmm3
    paddw   xmm1, xmm4
    paddw   xmm2, xmm5
%endmacro

%macro SAD_X4_START_2x8P_SSE2 0
    movq    xmm7, [r0]
    movq    xmm0, [r1]
    movq    xmm1, [r2]
    movq    xmm2, [r3]
    movq    xmm3, [r4]
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm0, [r1+r5]
    movhps  xmm1, [r2+r5]
    movhps  xmm2, [r3+r5]
    movhps  xmm3, [r4+r5]
    psadbw  xmm0, xmm7
    psadbw  xmm1, xmm7
    psadbw  xmm2, xmm7
    psadbw  xmm3, xmm7
%endmacro

%macro SAD_X4_2x8P_SSE2 0
    movq    xmm7, [r0]
    movq    xmm4, [r1]
    movq    xmm5, [r2]
%ifdef ARCH_X86_64
    movq    xmm6, [r3]
    movq    xmm8, [r4]
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm4, [r1+r5]
    movhps  xmm5, [r2+r5]
    movhps  xmm6, [r3+r5]
    movhps  xmm8, [r4+r5]
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    psadbw  xmm6, xmm7
    psadbw  xmm8, xmm7
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    paddw   xmm3, xmm8
%else
    movhps  xmm7, [r0+FENC_STRIDE]
    movhps  xmm4, [r1+r5]
    movhps  xmm5, [r2+r5]
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    movq    xmm6, [r3]
    movq    xmm4, [r4]
    movhps  xmm6, [r3+r5]
    movhps  xmm4, [r4+r5]
    psadbw  xmm6, xmm7
    psadbw  xmm4, xmm7
    paddw   xmm2, xmm6
    paddw   xmm3, xmm4
%endif
%endmacro
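; The ARCH_X86_64 branches here (SAD_X4_2x8P_SSE2 above, SAD_X4_1x16P_SSE2
; below) can keep all four running totals plus a scratch register live
; because x86-64 exposes xmm8; 32-bit mode only has xmm0-xmm7, so those
; paths recycle xmm4 for the fourth candidate and interleave its load with
; the first psadbw/paddw pairs to cover the reuse.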
%macro SAD_X4_START_1x16P_SSE2 0
    movdqa  xmm7, [r0]
    movdqu  xmm0, [r1]
    movdqu  xmm1, [r2]
    movdqu  xmm2, [r3]
    movdqu  xmm3, [r4]
    psadbw  xmm0, xmm7
    psadbw  xmm1, xmm7
    psadbw  xmm2, xmm7
    psadbw  xmm3, xmm7
%endmacro

%macro SAD_X4_1x16P_SSE2 2
    movdqa  xmm7, [r0+%1]
    movdqu  xmm4, [r1+%2]
    movdqu  xmm5, [r2+%2]
    movdqu  xmm6, [r3+%2]
%ifdef ARCH_X86_64
    movdqu  xmm8, [r4+%2]
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    psadbw  xmm6, xmm7
    psadbw  xmm8, xmm7
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    paddw   xmm3, xmm8
%else
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    paddw   xmm0, xmm4
    psadbw  xmm6, xmm7
    movdqu  xmm4, [r4+%2]
    paddw   xmm1, xmm5
    psadbw  xmm4, xmm7
    paddw   xmm2, xmm6
    paddw   xmm3, xmm4
%endif
%endmacro

%macro SAD_X4_2x16P_SSE2 1
%if %1
    SAD_X4_START_1x16P_SSE2
%else
    SAD_X4_1x16P_SSE2 0, 0
%endif
    SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r5]
    lea     r2, [r2+2*r5]
    lea     r3, [r3+2*r5]
    lea     r4, [r4+2*r5]
%endmacro

%macro SAD_X3_2x8P_SSE2 1
%if %1
    SAD_X3_START_2x8P_SSE2
%else
    SAD_X3_2x8P_SSE2
%endif
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r4]
    lea     r2, [r2+2*r4]
    lea     r3, [r3+2*r4]
%endmacro

%macro SAD_X4_2x8P_SSE2 1
%if %1
    SAD_X4_START_2x8P_SSE2
%else
    SAD_X4_2x8P_SSE2
%endif
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r5]
    lea     r2, [r2+2*r5]
    lea     r3, [r3+2*r5]
    lea     r4, [r4+2*r5]
%endmacro

%macro SAD_X3_END_SSE2 0
    ; each accumulator holds two 16-bit partial sums, one per 64-bit lane;
    ; fold the high lane into the low one, then store the three 32-bit scores
    movhlps xmm4, xmm0
    movhlps xmm5, xmm1
    movhlps xmm6, xmm2
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
%ifdef UNIX64
    movd    [r5+0], xmm0
    movd    [r5+4], xmm1
    movd    [r5+8], xmm2
%else
    ; non-UNIX64 ABIs pass the scores pointer on the stack: reload via r5mp
    mov     r0, r5mp
    movd    [r0+0], xmm0
    movd    [r0+4], xmm1
    movd    [r0+8], xmm2
%endif
    RET
%endmacro
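;-----------------------------------------------------------------------------
; Reference semantics of the sad_x3 family as a minimal C sketch (an
; illustration only, not x264's actual C fallback; sad_x3_ref, w and h are
; names invented here). FENC_STRIDE is the fixed stride of the encode-order
; pixel cache, 16 in x264:
;
;   static void sad_x3_ref( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                           uint8_t *pix2, int i_stride, int scores[3],
;                           int w, int h )
;   {
;       uint8_t *pix[3] = { pix0, pix1, pix2 };
;       for( int k = 0; k < 3; k++ )
;       {
;           int sad = 0;
;           for( int y = 0; y < h; y++ )
;               for( int x = 0; x < w; x++ )
;                   sad += abs( fenc[y*FENC_STRIDE+x] - pix[k][y*i_stride+x] );
;           scores[k] = sad;
;       }
;   }
;
; The x4 variants do the same for four candidates, writing scores[0..3].
;-----------------------------------------------------------------------------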