pixel-sse2.asm
    movdqa  xmm5, xmm1
    punpcklqdq xmm1, xmm3       ; rows 1 and 3
    punpckhqdq xmm5, xmm3       ; next 4x4 rows 1 and 3
%endmacro

%macro SUM1x8_SSE2 3    ; 01 junk sum
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
    paddusw %3, %1
%endmacro

%macro SUM4x4_SSE2 4    ; 02 13 junk sum
    pxor    %3, %3
    psubw   %3, %1
    pmaxsw  %1, %3

    pxor    %3, %3
    psubw   %3, %2
    pmaxsw  %2, %3

    paddusw %4, %1
    paddusw %4, %2
%endmacro

;;; two SUM4x4_SSE2 running side-by-side
%macro SUM4x4_TWO_SSE2 7    ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
    pxor    %3, %3
    pxor    %6, %6
    psubw   %3, %1
    psubw   %6, %4
    pmaxsw  %1, %3
    pmaxsw  %4, %6
    pxor    %3, %3
    pxor    %6, %6
    psubw   %3, %2
    psubw   %6, %5
    pmaxsw  %2, %3
    pmaxsw  %5, %6
    paddusw %1, %2
    paddusw %4, %5
    paddusw %7, %1
    paddusw %7, %4
%endmacro

%macro SUM_MM_SSE2 2    ; sum junk
    movdqa  %2, %1
    psrldq  %1, 2
    paddusw %1, %2
    pand    %1, [pd_0000ffff GLOBAL]
    movdqa  %2, %1
    psrldq  %1, 4
    paddd   %1, %2
    movdqa  %2, %1
    psrldq  %1, 8
    paddd   %1, %2
    movd    eax, %1
%endmacro

%macro SATD_TWO_SSE2 0
    LOAD4x8_DIFF_SSE2
    HADAMARD4x4_TWO_SSE2        xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
    TRANSPOSE4x4_TWIST_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    HADAMARD4x4_TWO_SSE2        xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    SUM4x4_TWO_SSE2             xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
%endmacro

%macro SATD_START 0
;   mov     rdi, rdi            ; pix1
    movsxd  rsi, esi            ; stride1
;   mov     rdx, rdx            ; pix2
    movsxd  rcx, ecx            ; stride2
    pxor    xmm6, xmm6
%endmacro

%macro SATD_END 0
    psrlw   xmm6, 1
    SUM_MM_SSE2 xmm6, xmm7
    ret
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x16_sse2:
    SATD_START
    mov     r8,  rdi
    mov     r9,  rdx

    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    lea     rdi, [r8+8]
    lea     rdx, [r9+8]

    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x16_sse2:
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x8_sse2:
    SATD_START
    mov     r8,  rdi
    mov     r9,  rdx

    SATD_TWO_SSE2
    SATD_TWO_SSE2

    lea     rdi, [r8+8]
    lea     rdx, [r9+8]

    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x8_sse2:
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x4_sse2:
    SATD_START
    SATD_TWO_SSE2
    SATD_END
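;-----------------------------------------------------------------------------
; Note (added): the routines above compute SATD - the sum of absolute values
; of a 4x4 Hadamard transform applied to the difference of the two pixel
; blocks, with the total halved at the end (psrlw xmm6, 1 in SATD_END).
; As an orientation only, one 4x4 block works out to roughly the following
; scalar C. This is an illustrative sketch, not the C reference from x264;
; satd_4x4 and its parameters are hypothetical names.
;
;   /* needs <stdlib.h> for abs() */
;   static int satd_4x4( const uint8_t *p1, int i1, const uint8_t *p2, int i2 )
;   {
;       int tmp[4][4], sum = 0;
;       for( int y = 0; y < 4; y++, p1 += i1, p2 += i2 )
;       {
;           int d0 = p1[0]-p2[0], d1 = p1[1]-p2[1];
;           int d2 = p1[2]-p2[2], d3 = p1[3]-p2[3];
;           int a0 = d0+d2, a1 = d0-d2, a2 = d1+d3, a3 = d1-d3;
;           tmp[y][0] = a0+a2; tmp[y][1] = a1+a3;   /* horizontal Hadamard */
;           tmp[y][2] = a0-a2; tmp[y][3] = a1-a3;
;       }
;       for( int x = 0; x < 4; x++ )
;       {
;           int a0 = tmp[0][x]+tmp[2][x], a1 = tmp[0][x]-tmp[2][x];
;           int a2 = tmp[1][x]+tmp[3][x], a3 = tmp[1][x]-tmp[3][x];
;           sum += abs(a0+a2) + abs(a1+a3)          /* vertical Hadamard   */
;                + abs(a0-a2) + abs(a1-a3);         /* plus absolute sum   */
;       }
;       return sum >> 1;
;   }
;
; SATD_TWO_SSE2 evaluates two such 4x4 blocks per pass, packed side by side
; in the xmm registers.
;-----------------------------------------------------------------------------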
%macro LOAD_DIFF_8P 4  ; MMP, MMT, [pix1], [pix2]
    movq        %1, %3
    movq        %2, %4
    punpcklbw   %1, %2
    punpcklbw   %2, %2
    psubw       %1, %2
%endmacro

%macro SBUTTERFLY 5
    mov%1       %5, %3
    punpckl%2   %3, %4
    punpckh%2   %5, %4
%endmacro

;-----------------------------------------------------------------------------
; input ABCDEFGH output AFHDTECB
;-----------------------------------------------------------------------------
%macro TRANSPOSE8x8 9
    SBUTTERFLY dqa, wd,  %1, %2, %9
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %9, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro

%macro SUMSUB_BADC 4
    paddw   %1, %2
    paddw   %3, %4
    paddw   %2, %2
    paddw   %4, %4
    psubw   %2, %1
    psubw   %4, %3
%endmacro

%macro HADAMARD1x8 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sa8d_8x8_sse2:
    lea     r10, [3*parm2q]
    lea     r11, [3*parm4q]
    LOAD_DIFF_8P xmm0, xmm8, [parm1q],          [parm3q]
    LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
    LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10],      [parm3q+r11]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    LOAD_DIFF_8P xmm4, xmm8, [parm1q],          [parm3q]
    LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
    LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10],      [parm3q+r11]

    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    HADAMARD1x8  xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1

    pxor    xmm10, xmm10
    SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
    SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
    psrlw   xmm10, 1
    SUM_MM_SSE2 xmm10, xmm0
    add     r8d, eax            ; preserve rounding for 16x16
    add     eax, 1
    shr     eax, 1
    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
x264_pixel_sa8d_16x16_sse2:
    xor     r8d, r8d
    call    x264_pixel_sa8d_8x8_sse2 ; pix[0]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    call    x264_pixel_sa8d_8x8_sse2 ; pix[8*stride]
    lea     r10, [3*parm2q-2]
    lea     r11, [3*parm4q-2]
    shl     r10, 2
    shl     r11, 2
    sub     parm1q, r10
    sub     parm3q, r11
    call    x264_pixel_sa8d_8x8_sse2 ; pix[8]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    call    x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8]
    mov     eax, r8d
    add     eax, 1
    shr     eax, 1
    ret
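;-----------------------------------------------------------------------------
; Note (added): SA8D is the 8x8 counterpart of SATD - an 8x8 Hadamard of the
; pixel differences followed by an absolute sum. x264_pixel_sa8d_8x8_sse2
; returns (sum/2 + 1) >> 1 in eax, but also accumulates the un-rounded sum/2
; into r8d ("preserve rounding for 16x16"). The 16x16 version clears r8d,
; calls the 8x8 routine on the four 8x8 quadrants, and rounds the r8d total
; once at the end; it is flagged as violating the calling convention because
; it relies on r8d surviving the calls and on the 8x8 routine leaving
; parm1q/parm3q advanced by 4*stride.
;-----------------------------------------------------------------------------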
ALIGN 16
;-----------------------------------------------------------------------------
;   void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
x264_intra_sa8d_x3_8x8_core_sse2:
    ; 8x8 hadamard
    pxor        xmm4, xmm4
    movq        xmm0, [parm1q+0*FENC_STRIDE]
    movq        xmm7, [parm1q+1*FENC_STRIDE]
    movq        xmm6, [parm1q+2*FENC_STRIDE]
    movq        xmm3, [parm1q+3*FENC_STRIDE]
    movq        xmm5, [parm1q+4*FENC_STRIDE]
    movq        xmm1, [parm1q+5*FENC_STRIDE]
    movq        xmm8, [parm1q+6*FENC_STRIDE]
    movq        xmm2, [parm1q+7*FENC_STRIDE]
    punpcklbw   xmm0, xmm4
    punpcklbw   xmm7, xmm4
    punpcklbw   xmm6, xmm4
    punpcklbw   xmm3, xmm4
    punpcklbw   xmm5, xmm4
    punpcklbw   xmm1, xmm4
    punpcklbw   xmm8, xmm4
    punpcklbw   xmm2, xmm4
    HADAMARD1x8  xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
    TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

    ; dc
    movzx       edi, word [parm2q+0]
    add         di,  word [parm2q+16]
    add         edi, 8
    and         edi, -16
    shl         edi, 2

    pxor        xmm15, xmm15
    movdqa      xmm8,  xmm2
    movdqa      xmm9,  xmm3
    movdqa      xmm10, xmm4
    movdqa      xmm11, xmm5
    SUM4x4_TWO_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
    movdqa      xmm8, xmm6
    movdqa      xmm9, xmm7
    SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
    movdqa      xmm8, xmm1
    SUM1x8_SSE2 xmm8, xmm10, xmm15
    movdqa      xmm14, xmm15    ; 7x8 sum

    movdqa      xmm8, [parm2q+0]  ; left edge
    movd        xmm9, edi
    psllw       xmm8, 3
    psubw       xmm8, xmm0
    psubw       xmm9, xmm0
    SUM1x8_SSE2 xmm8, xmm10, xmm14
    SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum
    punpcklwd   xmm0, xmm1
    punpcklwd   xmm2, xmm3
    punpcklwd   xmm4, xmm5
    punpcklwd   xmm6, xmm7
    punpckldq   xmm0, xmm2
    punpckldq   xmm4, xmm6
    punpcklqdq  xmm0, xmm4        ; transpose
    movdqa      xmm1, [parm2q+16] ; top edge
    movdqa      xmm2, xmm15
    psllw       xmm1, 3
    psrldq      xmm2, 2           ; 8x7 sum
    psubw       xmm0, xmm1        ; 8x1 sum
    SUM1x8_SSE2 xmm0, xmm1, xmm2

    SUM_MM_SSE2 xmm14, xmm3
    add         eax, 2
    shr         eax, 2
    mov         [parm3q+4], eax   ; i8x8_h sa8d
    SUM_MM_SSE2 xmm15, xmm4
    add         eax, 2
    shr         eax, 2
    mov         [parm3q+8], eax   ; i8x8_dc sa8d
    SUM_MM_SSE2 xmm2, xmm5
    add         eax, 2
    shr         eax, 2
    mov         [parm3q+0], eax   ; i8x8_v sa8d
    ret
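;-----------------------------------------------------------------------------
; Note (added): x264_intra_sa8d_x3_8x8_core_sse2 transforms the 8x8 source
; block once and derives three SA8D-style costs from that shared transform,
; one per 8x8 intra prediction mode, instead of performing three full
; difference transforms. The caller passes pre-computed edge data in
; edges[2][8] ([parm2q+0] = left edge, [parm2q+16] = top edge, per the
; comments above); each cost is rounded as (sum + 2) >> 2 and stored to
; res[0] = i8x8_v, res[1] = i8x8_h, res[2] = i8x8_dc.
;-----------------------------------------------------------------------------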