; pixel-a.asm
    paddw   m0, m1
    pshufw  m1, m0, 10110001b
    paddw   m0, m1
    movd   eax, m0
    and    eax, 0xffff
    RET
%endmacro

; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_pixel_satd_16x4_internal_mmxext
    SATD_4x4_MMX m2,  0, 0
    SATD_4x4_MMX m1,  4, 0
    paddw        m0, m2
    SATD_4x4_MMX m2,  8, 0
    paddw        m0, m1
    SATD_4x4_MMX m1, 12, 0
    paddw        m0, m2
    paddw        m0, m1
    ret

cglobal x264_pixel_satd_8x8_internal_mmxext
    SATD_4x4_MMX m2,  0, 0
    SATD_4x4_MMX m1,  4, 1
    paddw        m0, m2
    paddw        m0, m1
x264_pixel_satd_8x4_internal_mmxext:
    SATD_4x4_MMX m2,  0, 0
    SATD_4x4_MMX m1,  4, 0
    paddw        m0, m2
    paddw        m0, m1
    ret

cglobal x264_pixel_satd_16x16_mmxext, 4,6
    SATD_START_MMX
    pxor m0, m0
%rep 3
    call x264_pixel_satd_16x4_internal_mmxext
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
%endrep
    call x264_pixel_satd_16x4_internal_mmxext
    HADDUW m0, m1
    movd eax, m0
    RET

cglobal x264_pixel_satd_16x8_mmxext, 4,6
    SATD_START_MMX
    pxor m0, m0
    call x264_pixel_satd_16x4_internal_mmxext
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
    call x264_pixel_satd_16x4_internal_mmxext
    SATD_END_MMX

cglobal x264_pixel_satd_8x16_mmxext, 4,6
    SATD_START_MMX
    pxor m0, m0
    call x264_pixel_satd_8x8_internal_mmxext
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
    call x264_pixel_satd_8x8_internal_mmxext
    SATD_END_MMX

cglobal x264_pixel_satd_8x8_mmxext, 4,6
    SATD_START_MMX
    pxor m0, m0
    call x264_pixel_satd_8x8_internal_mmxext
    SATD_END_MMX

cglobal x264_pixel_satd_8x4_mmxext, 4,6
    SATD_START_MMX
    pxor m0, m0
    call x264_pixel_satd_8x4_internal_mmxext
    SATD_END_MMX

cglobal x264_pixel_satd_4x8_mmxext, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 0
    paddw m0, m1
    SATD_END_MMX

cglobal x264_pixel_satd_4x4_mmxext, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 0
    SATD_END_MMX

%macro SATD_START_SSE2 3
%ifnidn %1, sse2
    mova %3, [hmul_8p GLOBAL]
%endif
    lea  r4, [3*r1]
    lea  r5, [3*r3]
    pxor %2, %2
%endmacro

%macro SATD_END_SSE2 2
    HADDW %2, m7
    movd eax, %2
    RET
%endmacro

%macro BACKUP_POINTERS 0
%ifdef ARCH_X86_64
    mov r10, r0
    mov r11, r2
%endif
%endmacro

%macro RESTORE_AND_INC_POINTERS 0
%ifdef ARCH_X86_64
    lea r0, [r10+8]
    lea r2, [r11+8]
%else
    mov r0, r0mp
    mov r2, r2mp
    add r0, 8
    add r2, 8
%endif
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 1
INIT_XMM
%ifnidn %1, sse2
cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
    SATD_START_MMX
    mova m4, [hmul_4p GLOBAL]
    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3
    HADDW m0, m1
    movd eax, m0
    RET
%endif

cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
    SATD_START_MMX
%ifnidn %1, sse2
    mova m7, [hmul_4p GLOBAL]
%endif
    movd m4, [r2]
    movd m5, [r2+r3]
    movd m6, [r2+2*r3]
    add  r2, r5
    movd m0, [r0]
    movd m1, [r0+r1]
    movd m2, [r0+2*r1]
    add  r0, r4
    movd m3, [r2+r3]
    JDUP m4, m3
    movd m3, [r0+r1]
    JDUP m0, m3
    movd m3, [r2+2*r3]
    JDUP m5, m3
    movd m3, [r0+2*r1]
    JDUP m1, m3
    DIFFOP 0, 4, 1, 5, 7
    movd m5, [r2]
    add  r2, r5
    movd m3, [r0]
    add  r0, r4
    movd m4, [r2]
    JDUP m6, m4
    movd m4, [r0]
    JDUP m2, m4
    movd m4, [r2+r3]
    JDUP m5, m4
    movd m4, [r0+r1]
    JDUP m3, m4
    DIFFOP 2, 6, 3, 5, 7
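    ; note: the loads above pack two 4-pixel rows per register (JDUP) and form
    ; the differences of the two input blocks (DIFFOP), so the single 8x4 SATD
    ; pass below covers the entire 4x8 block.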
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6, swap
    HADDW m6, m1
    movd eax, m6
    RET

cglobal x264_pixel_satd_8x8_internal_%1
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
x264_pixel_satd_8x4_internal_%1:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
    ret

%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
cglobal x264_pixel_satd_16x4_internal_%1
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    lea  r2, [r2+4*r3]
    lea  r0, [r0+4*r1]
    SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
    SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
    ret

cglobal x264_pixel_satd_16x8_%1, 4,6,12
    SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
    mova m7, [pw_00ff GLOBAL]
%endif
    jmp x264_pixel_satd_16x8_internal_%1

cglobal x264_pixel_satd_16x16_%1, 4,6,12
    SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
    mova m7, [pw_00ff GLOBAL]
%endif
    call x264_pixel_satd_16x4_internal_%1
    call x264_pixel_satd_16x4_internal_%1
x264_pixel_satd_16x8_internal_%1:
    call x264_pixel_satd_16x4_internal_%1
    call x264_pixel_satd_16x4_internal_%1
    SATD_END_SSE2 %1, m10
%else

cglobal x264_pixel_satd_16x8_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    BACKUP_POINTERS
    call x264_pixel_satd_8x8_internal_%1
    RESTORE_AND_INC_POINTERS
    call x264_pixel_satd_8x8_internal_%1
    SATD_END_SSE2 %1, m6

cglobal x264_pixel_satd_16x16_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    BACKUP_POINTERS
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
    RESTORE_AND_INC_POINTERS
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
    SATD_END_SSE2 %1, m6
%endif

cglobal x264_pixel_satd_8x16_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
    SATD_END_SSE2 %1, m6

cglobal x264_pixel_satd_8x8_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    SATD_END_SSE2 %1, m6

cglobal x264_pixel_satd_8x4_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x4_internal_%1
    SATD_END_SSE2 %1, m6
%endmacro ; SATDS_SSE2

%macro SA8D 1
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sa8d_8x8_internal_%1
    lea  r10, [r0+4*r1]
    lea  r11, [r2+4*r3]
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
%ifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
    HADAMARD4_V m0, m1, m2, m8, m6
    HADAMARD4_V m4, m5, m3, m9, m6
    SUMSUB_BADC m0, m4, m1, m5, m6
    HADAMARD 2, sumsub, 0, 4, 6, 11
    HADAMARD 2, sumsub, 1, 5, 6, 11
    SUMSUB_BADC m2, m3, m8, m9, m6
    HADAMARD 2, sumsub, 2, 3, 6, 11
    HADAMARD 2, sumsub, 8, 9, 6, 11
    HADAMARD 1, amax, 0, 4, 6, 11
    HADAMARD 1, amax, 1, 5, 6, 4
    HADAMARD 1, amax, 2, 3, 6, 4
    HADAMARD 1, amax, 8, 9, 6, 4
%endif
    paddw m0, m1
    paddw m0, m2
    paddw m0, m8
    SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1
    ret

cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
    lea  r4, [3*r1]
    lea  r5, [3*r3]
%ifnidn %1, sse2
    mova m7, [hmul_8p GLOBAL]
%endif
    call x264_pixel_sa8d_8x8_internal_%1
    HADDW m0, m1
    movd eax, m0
    add eax, 1
    shr eax, 1
    RET

cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
    lea  r4, [3*r1]
    lea  r5, [3*r3]
%ifnidn %1, sse2
    mova m7, [hmul_8p GLOBAL]
%endif
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
    add  r2, 8
    add  r0, 8
    mova m10, m0
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
    lea  r2, [r2+8*r3]
    lea  r0, [r0+8*r1]
    paddusw m10, m0
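    ; note: the four 8x8 sub-blocks are visited as pix[0], pix[8],
    ; pix[8*stride+8], pix[8*stride]; partial sums accumulate in m10 with
    ; saturating adds and are widened by HADDUW before the final rounding shift.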
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
    sub  r2, 8
    sub  r0, 8
    paddusw m10, m0
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
    paddusw m0, m10
    HADDUW m0, m1
    movd eax, m0
    add eax, 1
    shr eax, 1
    RET

%else ; ARCH_X86_32
%ifnidn %1, mmxext
cglobal x264_pixel_sa8d_8x8_internal_%1
    %define spill0 [esp+4]
    %define spill1 [esp+20]
    %define spill2 [esp+36]
%ifidn %1, sse2
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    movdqa spill0, m3
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    movdqa m3, spill0
    paddw  m0, m1
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; non-sse2
    mova m7, [hmul_8p GLOBAL]
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    mova spill0, m2
    mova spill1, m3
    mova spill2, m1
    SWAP 1, 7
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V m4, m5, m6, m7, m3
    mova m1, spill2
    mova m2, spill0
    mova m3, spill1
    mova spill0, m6
    mova spill1, m7
    HADAMARD4_V m0, m1, m2, m3, m7
    SUMSUB_BADC m0, m4, m1, m5, m7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    mova m6, spill0
    mova m7, spill1
    paddw m0, m1
    SUMSUB_BADC m2, m6, m3, m7, m4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
    paddw m0, m2
    paddw m0, m3
    ret
%endif ; ifndef mmxext

cglobal x264_pixel_sa8d_8x8_%1, 4,7
    mov  r6, esp
    and  esp, ~15
    sub  esp, 48
    lea  r4, [3*r1]
    lea  r5, [3*r3]
    call x264_pixel_sa8d_8x8_internal_%1
    HADDW m0, m1
    movd eax, m0
    add eax, 1
    shr eax, 1
    mov esp, r6
    RET

cglobal x264_pixel_sa8d_16x16_%1, 4,7
    mov  r6, esp
    and  esp, ~15
    sub  esp, 64
    lea  r4, [3*r1]
    lea  r5, [3*r3]
    call x264_pixel_sa8d_8x8_internal_%1
%ifidn %1, mmxext
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
%endif
    mova [esp+48], m0
    call x264_pixel_sa8d_8x8_internal_%1
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8
    add  r2, 8
    paddusw m0, [esp+48]
    mova [esp+48], m0
    call x264_pixel_sa8d_8x8_internal_%1
%ifidn %1, mmxext
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
%endif
%if mmsize == 16
    paddusw m0, [esp+48]
%endif
    mova [esp+64-mmsize], m0
    call x264_pixel_sa8d_8x8_internal_%1
    paddusw m0, [esp+64-mmsize]
%if mmsize == 16
    HADDUW m0, m1
%else
    mova m2, [esp+48]
    pxor m7, m7
    mova m1, m0
    mova m3, m2
    punpcklwd m0, m7
    punpckhwd m1, m7
    punpcklwd m2, m7
    punpckhwd m3, m7
    paddd m0, m1
    paddd m2, m3
    paddd m0, m2
    HADDD m0, m1
%endif
    movd eax, m0
    add eax, 1
    shr eax, 1
    mov esp, r6
    RET
%endif ; !ARCH_X86_64
%endmacro ; SA8D

;=============================================================================
; INTRA SATD
;=============================================================================

%macro INTRA_SA8D_SSE2 1
%ifdef ARCH_X86_64
INIT_XMM
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
    ; 8x8 hadamard
    pxor m8, m8
    movq m0, [r0+0*FENC_STRIDE]
    movq m1, [r0+1*FENC_STRIDE]
    movq m2, [r0+2*FENC_STRIDE]
    movq m3, [r0+3*FENC_STRIDE]
    movq m4, [r0+4*FENC_STRIDE]
    movq m5, [r0+5*FENC_STRIDE]
    movq m6, [r0+6*FENC_STRIDE]
    movq m7, [r0+7*FENC_STRIDE]
    punpcklbw m0, m8
    punpcklbw m1, m8
    punpcklbw m2, m8
    punpcklbw m3, m8
    punpcklbw m4, m8
    punpcklbw m5, m8
    punpcklbw m6, m8
    punpcklbw m7, m8
    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
    ; dc
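    ; note: the i8x8_dc prediction term below is built from the DC coefficients
    ; of the pre-transformed left edge ([r1+0]) and top edge ([r1+16]) and is
    ; rounded to a multiple of 16 before being compared against the block's DC.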
    movzx r0d, word [r1+0]
    add  r0w, word [r1+16]
    add  r0d, 8
    and  r0d, -16
    shl  r0d, 2

    pxor m15, m15
    movdqa m8, m2
    movdqa m9, m3
    movdqa m10, m4
    movdqa m11, m5
    ABS4 m8, m9, m10, m11, m12, m13
    paddusw m8, m10
    paddusw m9, m11
%ifidn %1, ssse3
    pabsw m10, m6
    pabsw m11, m7
    pabsw m15, m1
%else
    movdqa m10, m6
    movdqa m11, m7
    movdqa m15, m1
    ABS2 m10, m11, m13, m14
    ABS1 m15, m13
%endif
    paddusw m10, m11
    paddusw m8, m9
    paddusw m15, m10
    paddusw m15, m8
    movdqa m14, m15 ; 7x8 sum
    movdqa m8, [r1+0] ; left edge
    movd m9, r0d
    psllw m8, 3
    psubw m8, m0
    psubw m9, m0
    ABS1 m8, m10
    ABS1 m9, m11 ; 1x8 sum
    paddusw m14, m8
    paddusw m15, m9
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpcklwd m4, m5
    punpcklwd m6, m7
    punpckldq m0, m2
    punpckldq m4, m6
    punpcklqdq m0, m4 ; transpose
    movdqa m1, [r1+16] ; top edge
    movdqa m2, m15
    psllw m1, 3
    psrldq m2, 2 ; 8x7 sum
    psubw m0, m1 ; 8x1 sum
    ABS1 m0, m1
    paddusw m2, m0

    ; 3x HADDW
    movdqa m7, [pw_1 GLOBAL]
    pmaddwd m2, m7
    pmaddwd m14, m7
    pmaddwd m15, m7
    movdqa m3, m2
    punpckldq m2, m14
    punpckhdq m3, m14
    pshufd m5, m15, 0xf5
    paddd m2, m3
    paddd m5, m15
    movdqa m3, m2
    punpcklqdq m2, m5
    punpckhqdq m3, m5
    pavgw m3, m2
    pxor m0, m0
    pavgw m3, m0
    movq [r2], m3 ; i8x8_v, i8x8_h
    psrldq m3, 8
    movd [r2+8], m3 ; i8x8_dc
    RET
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2

; in: r0 = fenc
; out: m0..m3 = hadamard coefs
INIT_MMX
ALIGN 16
load_hadamard:
    pxor m7, m7
    movd m0, [r0+0*FENC_STRIDE]
    movd m1, [r0+1*FENC_STRIDE]
    movd m2, [r0+2*FENC_STRIDE]
    movd m3, [r0+3*FENC_STRIDE]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    HADAMARD4_2D 0, 1, 2, 3, 4
    SAVE_MM_PERMUTATION load_hadamard
    ret

%macro SCALAR_SUMSUB 4
    add %1, %2
    add %3, %4
    add %2, %2
    add %4, %4
    sub %2, %1
    sub %4, %3
%endmacro

%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
%ifnidn %1, 0
    shl %1d, 5 ; log(FDEC_STRIDE)
%endif
    movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
    movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
    movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
    movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
%ifnidn %1, 0
    shr %1d, 5
%endif
    SCALAR_SUMSUB %2d, %3d, %4d, %5d