pixel-a.asm
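;-----------------------------------------------------------------------------
; Intra SATD "x3" kernels (vertical/horizontal/DC predictions scored in one
; call) for 4x4, 16x16 and 8x8-chroma blocks, plus the hadamard_ac
; (AC texture energy) kernels, in MMX/SSE2 variants.
;-----------------------------------------------------------------------------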
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov         [left_1d+2*%1+0], %2w
    mov         [left_1d+2*%1+2], %3w
    mov         [left_1d+2*%1+4], %4w
    mov         [left_1d+2*%1+6], %5w
%endmacro

%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
    movzx       %2d, byte [r1+%1-FDEC_STRIDE+0]
    movzx       %3d, byte [r1+%1-FDEC_STRIDE+1]
    movzx       %4d, byte [r1+%1-FDEC_STRIDE+2]
    movzx       %5d, byte [r1+%1-FDEC_STRIDE+3]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov         [top_1d+2*%1+0], %2w
    mov         [top_1d+2*%1+2], %3w
    mov         [top_1d+2*%1+4], %4w
    mov         [top_1d+2*%1+6], %5w
%endmacro

%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pxor        %7, %7
    pshufw      %4, %1, 01001110b
    pshufw      %5, %2, 01001110b
    pshufw      %6, %3, 01001110b
    paddw       %1, %4
    paddw       %2, %5
    paddw       %3, %6
    punpcklwd   %1, %7
    punpcklwd   %2, %7
    punpcklwd   %3, %7
    pshufw      %4, %1, 01001110b
    pshufw      %5, %2, 01001110b
    pshufw      %6, %3, 01001110b
    %8          %1, %4
    %8          %2, %5
    %8          %3, %6
%endmacro

%macro CLEAR_SUMS 0
%ifdef ARCH_X86_64
    mov   qword [sums+0], 0
    mov   qword [sums+8], 0
    mov   qword [sums+16], 0
%else
    pxor  m7, m7
    movq  [sums+0], m7
    movq  [sums+8], m7
    movq  [sums+16], m7
%endif
%endmacro

; in: m1..m3; out: m7; clobber: m4..m6
%macro SUM3x4 1
%ifidn %1, ssse3
    pabsw       m4, m1
    pabsw       m5, m2
    pabsw       m7, m3
    paddw       m4, m5
%else
    movq        m4, m1
    movq        m5, m2
    ABS2        m4, m5, m6, m7
    movq        m7, m3
    paddw       m4, m5
    ABS1        m7, m6
%endif
    paddw       m7, m4
%endmacro

; in: m0..m3 (4x4), m7 (3x4); out: m0 v, m4 h, m5 dc; clobber: m6
%macro SUM4x3 3 ; dc, left, top
    movq        m4, %2
    movd        m5, %1
    psllw       m4, 2
    psubw       m4, m0
    psubw       m5, m0
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpckldq   m0, m2 ; transpose
    movq        m1, %3
    psllw       m1, 2
    psubw       m0, m1
    ABS2        m4, m5, m2, m3 ; 1x4 sum
    ABS1        m0, m1 ; 4x1 sum
%endmacro

%macro INTRA_SATDS_MMX 1
INIT_MMX
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_4x4_%1, 2,6
%ifdef ARCH_X86_64
    ; stack is 16 byte aligned because abi says so
    %define  top_1d  rsp-8  ; size 8
    %define  left_1d rsp-16 ; size 8
    %define  t0 r10
%else
    ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
    SUB         esp, 16
    %define  top_1d  esp+8
    %define  left_1d esp
    %define  t0 r2
%endif

    call load_hadamard
    SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
    mov         t0d, r0d
    SCALAR_HADAMARD_TOP  0, r0, r3, r4, r5
    lea         t0d, [t0d + r0d + 4]
    and         t0d, -8
    shl         t0d, 1 ; dc

    SUM3x4 %1
    SUM4x3 t0d, [left_1d], [top_1d]
    paddw       m4, m7
    paddw       m5, m7
    movq        m1, m5
    psrlq       m1, 16 ; 4x3 sum
    paddw       m0, m1

    SUM_MM_X3   m0, m4, m5, m1, m2, m3, m6, pavgw
%ifndef ARCH_X86_64
    mov         r2, r2mp
%endif
    movd        [r2+0], m0 ; i4x4_v satd
    movd        [r2+4], m4 ; i4x4_h satd
    movd        [r2+8], m5 ; i4x4_dc satd
%ifndef ARCH_X86_64
    ADD         esp, 16
%endif
    RET
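For reference, a minimal scalar C sketch of what the x3 routine above computes: the SATD (sum of absolute values of the 2D 4x4 Hadamard transform of the difference block, halved) of the encoded 4x4 block against the vertical, horizontal and DC predictions built from the decoded neighbours, written to res[0], res[1] and res[2] respectively. The helper names satd_4x4_ref and intra_satd_x3_4x4_ref are illustrative only, FENC_STRIDE/FDEC_STRIDE are taken as their usual x264 values (16/32), and the pavgw-based summation in the assembly may round slightly differently.

#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 16
#define FDEC_STRIDE 32

/* Scalar 4x4 SATD: 2D Hadamard of the difference block, sum of |coeffs|, halved. */
static int satd_4x4_ref( const uint8_t *fenc, int fenc_stride,
                         const uint8_t *pred, int pred_stride )
{
    int d[4][4], t[4][4], sum = 0;
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 4; j++ )
            d[i][j] = fenc[i*fenc_stride+j] - pred[i*pred_stride+j];
    for( int i = 0; i < 4; i++ )
    {   /* horizontal butterflies */
        int a0 = d[i][0]+d[i][1], a1 = d[i][0]-d[i][1];
        int a2 = d[i][2]+d[i][3], a3 = d[i][2]-d[i][3];
        t[i][0] = a0+a2; t[i][2] = a0-a2;
        t[i][1] = a1+a3; t[i][3] = a1-a3;
    }
    for( int j = 0; j < 4; j++ )
    {   /* vertical butterflies + absolute sum */
        int a0 = t[0][j]+t[1][j], a1 = t[0][j]-t[1][j];
        int a2 = t[2][j]+t[3][j], a3 = t[2][j]-t[3][j];
        sum += abs(a0+a2) + abs(a0-a2) + abs(a1+a3) + abs(a1-a3);
    }
    return sum >> 1;
}

/* V/H/DC predictions from the decoded neighbours around fdec, scored in the
 * same order as the movd stores above: res[0]=V, res[1]=H, res[2]=DC. */
static void intra_satd_x3_4x4_ref( const uint8_t *fenc, const uint8_t *fdec, int res[3] )
{
    uint8_t pred[4*4];
    int dc = 0;
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 4; j++ )
            pred[i*4+j] = fdec[j-FDEC_STRIDE];        /* vertical */
    res[0] = satd_4x4_ref( fenc, FENC_STRIDE, pred, 4 );
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 4; j++ )
            pred[i*4+j] = fdec[i*FDEC_STRIDE-1];      /* horizontal */
    res[1] = satd_4x4_ref( fenc, FENC_STRIDE, pred, 4 );
    for( int i = 0; i < 4; i++ )
        dc += fdec[i-FDEC_STRIDE] + fdec[i*FDEC_STRIDE-1];
    dc = ( dc + 4 ) >> 3;                             /* DC */
    for( int i = 0; i < 16; i++ )
        pred[i] = (uint8_t)dc;
    res[2] = satd_4x4_ref( fenc, FENC_STRIDE, pred, 4 );
}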
%ifdef ARCH_X86_64
    %define  t0 r10
    %define  t2 r11
%else
    %define  t0 r0
    %define  t2 r2
%endif

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_%1, 0,7
%ifdef ARCH_X86_64
    %assign  stack_pad  88
%else
    %assign  stack_pad  88 + ((stack_offset+88+4)&15)
%endif
    ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
    SUB         rsp, stack_pad
%define sums    rsp+64 ; size 24
%define top_1d  rsp+32 ; size 32
%define left_1d rsp    ; size 32
    movifnidn   r1, r1mp
    CLEAR_SUMS

    ; 1D hadamards
    xor         t2d, t2d
    mov         t0d, 12
.loop_edge:
    SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
    add         t2d, r3d
    SCALAR_HADAMARD_TOP  t0, r3, r4, r5, r6
    add         t2d, r3d
    sub         t0d, 4
    jge .loop_edge
    shr         t2d, 1
    add         t2d, 8
    and         t2d, -16 ; dc

    ; 2D hadamards
    movifnidn   r0, r0mp
    xor         r3d, r3d
.loop_y:
    xor         r4d, r4d
.loop_x:
    call load_hadamard

    SUM3x4 %1
    SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
    pavgw       m4, m7
    pavgw       m5, m7
    paddw       m0, [sums+0]  ; i16x16_v satd
    paddw       m4, [sums+8]  ; i16x16_h satd
    paddw       m5, [sums+16] ; i16x16_dc satd
    movq        [sums+0], m0
    movq        [sums+8], m4
    movq        [sums+16], m5

    add         r0, 4
    inc         r4d
    cmp         r4d, 4
    jl  .loop_x
    add         r0, 4*FENC_STRIDE-16
    inc         r3d
    cmp         r3d, 4
    jl  .loop_y

; horizontal sum
    movifnidn   r2, r2mp
    movq        m2, [sums+16]
    movq        m1, [sums+8]
    movq        m0, [sums+0]
    movq        m7, m2
    SUM_MM_X3   m0, m1, m2, m3, m4, m5, m6, paddd
    psrld       m0, 1
    pslld       m7, 16
    psrld       m7, 16
    paddd       m0, m2
    psubd       m0, m7
    movd        [r2+8], m2 ; i16x16_dc satd
    movd        [r2+4], m1 ; i16x16_h satd
    movd        [r2+0], m0 ; i16x16_v satd
    ADD         rsp, stack_pad
    RET

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_%1, 0,6
    ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
    SUB          rsp, 72
%define  sums    rsp+48 ; size 24
%define  dc_1d   rsp+32 ; size 16
%define  top_1d  rsp+16 ; size 16
%define  left_1d rsp    ; size 16
    movifnidn   r1, r1mp
    CLEAR_SUMS

    ; 1D hadamards
    mov         t0d, 4
.loop_edge:
    SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
    SCALAR_HADAMARD_TOP  t0, t2, r3, r4, r5
    sub         t0d, 4
    jge .loop_edge

    ; dc
    movzx       t2d, word [left_1d+0]
    movzx       r3d, word [top_1d+0]
    movzx       r4d, word [left_1d+8]
    movzx       r5d, word [top_1d+8]
    add         t2d, r3d
    lea         r3, [r4 + r5]
    lea         t2, [2*t2 + 8]
    lea         r3, [2*r3 + 8]
    lea         r4, [4*r4 + 8]
    lea         r5, [4*r5 + 8]
    and         t2d, -16 ; tl
    and         r3d, -16 ; br
    and         r4d, -16 ; bl
    and         r5d, -16 ; tr
    mov         [dc_1d+ 0], t2d ; tl
    mov         [dc_1d+ 4], r5d ; tr
    mov         [dc_1d+ 8], r4d ; bl
    mov         [dc_1d+12], r3d ; br
    lea         r5, [dc_1d]

    ; 2D hadamards
    movifnidn   r0, r0mp
    movifnidn   r2, r2mp
    xor         r3d, r3d
.loop_y:
    xor         r4d, r4d
.loop_x:
    call load_hadamard

    SUM3x4 %1
    SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
    pavgw       m4, m7
    pavgw       m5, m7
    paddw       m0, [sums+16] ; i4x4_v satd
    paddw       m4, [sums+8]  ; i4x4_h satd
    paddw       m5, [sums+0]  ; i4x4_dc satd
    movq        [sums+16], m0
    movq        [sums+8], m4
    movq        [sums+0], m5

    add         r0, 4
    inc         r4d
    cmp         r4d, 2
    jl  .loop_x
    add         r0, 4*FENC_STRIDE-8
    add         r5, 8
    inc         r3d
    cmp         r3d, 2
    jl  .loop_y

; horizontal sum
    movq        m0, [sums+0]
    movq        m1, [sums+8]
    movq        m2, [sums+16]
    movq        m7, m0
    psrlq       m7, 15
    paddw       m2, m7
    SUM_MM_X3   m0, m1, m2, m3, m4, m5, m6, paddd
    psrld       m2, 1
    movd        [r2+0], m0 ; i8x8c_dc satd
    movd        [r2+4], m1 ; i8x8c_h satd
    movd        [r2+8], m2 ; i8x8c_v satd
    ADD         rsp, 72
    RET
%endmacro ; INTRA_SATDS_MMX
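A hedged sketch of how a caller might consume the x3 results, using the prototype from the banner comments above. The selection helper below is illustrative only; the real encoder also folds mode-cost terms into the comparison.

#include <stdint.h>

void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res );

/* Illustrative only: pick the cheapest of the three candidate predictions.
 * res[0]=V, res[1]=H, res[2]=DC, matching the stores in the 16x16 routine. */
static int cheapest_i16x16_pred( uint8_t *fenc, uint8_t *fdec )
{
    int res[3], best = 0;
    x264_intra_satd_x3_16x16_mmxext( fenc, fdec, res );
    for( int i = 1; i < 3; i++ )
        if( res[i] < res[best] )
            best = i;
    return best; /* 0 = vertical, 1 = horizontal, 2 = DC */
}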
%macro ABS_MOV_SSSE3 2
    pabsw   %1, %2
%endmacro

%macro ABS_MOV_MMX 2
    pxor    %1, %1
    psubw   %1, %2
    pmaxsw  %1, %2
%endmacro

%define ABS_MOV ABS_MOV_MMX

; in:  r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
cglobal x264_hadamard_ac_4x4_mmxext
    movh      m0, [r0]
    movh      m1, [r0+r1]
    movh      m2, [r0+r1*2]
    movh      m3, [r0+r2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    HADAMARD4_2D 0, 1, 2, 3, 4
    mova [r3],    m0
    mova [r3+8],  m1
    mova [r3+16], m2
    mova [r3+24], m3
    ABS1      m0, m4
    ABS1      m1, m4
    pand      m0, m6
    ABS1      m2, m4
    ABS1      m3, m4
    paddw     m0, m1
    paddw     m2, m3
    paddw     m0, m2
    SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
    ret

cglobal x264_hadamard_ac_2x2max_mmxext
    mova      m0, [r3+0x00]
    mova      m1, [r3+0x20]
    mova      m2, [r3+0x40]
    mova      m3, [r3+0x60]
    sub       r3, 8
    SUMSUB_BADC m0, m1, m2, m3, m4
    ABS4 m0, m2, m1, m3, m4, m5
    HADAMARD 0, max, 0, 2, 4, 5
    HADAMARD 0, max, 1, 3, 4, 5
    paddw     m7, m0
    paddw     m7, m1
    SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext
    ret

cglobal x264_hadamard_ac_8x8_mmxext
    mova      m6, [mask_ac4 GLOBAL]
    pxor      m7, m7
    call x264_hadamard_ac_4x4_mmxext
    add       r0, 4
    add       r3, 32
    mova      m5, m0
    call x264_hadamard_ac_4x4_mmxext
    lea       r0, [r0+4*r1]
    add       r3, 64
    paddw     m5, m0
    call x264_hadamard_ac_4x4_mmxext
    sub       r0, 4
    sub       r3, 32
    paddw     m5, m0
    call x264_hadamard_ac_4x4_mmxext
    paddw     m5, m0
    sub       r3, 40
    mova [rsp+gprsize+8], m5 ; save satd
%rep 3
    call x264_hadamard_ac_2x2max_mmxext
%endrep
    mova      m0, [r3+0x00]
    mova      m1, [r3+0x20]
    mova      m2, [r3+0x40]
    mova      m3, [r3+0x60]
    SUMSUB_BADC m0, m1, m2, m3, m4
    HADAMARD 0, sumsub, 0, 2, 4, 5
    ABS4 m1, m3, m0, m2, m4, m5
    HADAMARD 0, max, 1, 3, 4, 5
    pand      m6, m0
    paddw     m7, m1
    paddw     m6, m2
    paddw     m7, m7
    paddw     m6, m7
    mova [rsp+gprsize], m6 ; save sa8d
    SWAP m0, m6
    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
    ret

%macro HADAMARD_AC_WXH_MMX 2
cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
    %assign pad 16-gprsize-(stack_offset&15)
    %define ysub r1
    sub  rsp, 16+128+pad
    lea  r2, [r1*3]
    lea  r3, [rsp+16]
    call x264_hadamard_ac_8x8_mmxext
%if %2==16
    %define ysub r2
    lea  r0, [r0+r1*4]
    sub  rsp, 16
    call x264_hadamard_ac_8x8_mmxext
%endif
%if %1==16
    neg  ysub
    sub  rsp, 16
    lea  r0, [r0+ysub*4+8]
    neg  ysub
    call x264_hadamard_ac_8x8_mmxext
%if %2==16
    lea  r0, [r0+r1*4]
    sub  rsp, 16
    call x264_hadamard_ac_8x8_mmxext
%endif
%endif
    mova m1, [rsp+0x08]
%if %1*%2 >= 128
    paddusw m0, [rsp+0x10]
    paddusw m1, [rsp+0x18]
%endif
%if %1*%2 == 256
    mova m2, [rsp+0x20]
    paddusw m1, [rsp+0x28]
    paddusw m2, [rsp+0x30]
    mova m3, m0
    paddusw m1, [rsp+0x38]
    pxor m3, m2
    pand m3, [pw_1 GLOBAL]
    pavgw m0, m2
    psubusw m0, m3
    HADDUW m0, m2
%else
    psrlw m0, 1
    HADDW m0, m2
%endif
    psrlw m1, 1
    HADDW m1, m3
    movd edx, m0
    movd eax, m1
    shr  edx, 1
%ifdef ARCH_X86_64
    shl  rdx, 32
    add  rax, rdx
%endif
    add  rsp, 128+%1*%2/4+pad
    RET
%endmacro ; HADAMARD_AC_WXH_MMX

HADAMARD_AC_WXH_MMX 16, 16
HADAMARD_AC_WXH_MMX 8, 16
HADAMARD_AC_WXH_MMX 16, 8
HADAMARD_AC_WXH_MMX 8, 8
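The x264_pixel_hadamard_ac_WxH wrappers above return two texture-energy sums packed into one 64-bit value (see the edx:eax / rax packing at the end of HADAMARD_AC_WXH_MMX): the low half accumulates a satd-style sum of 4x4 Hadamard magnitudes and the high half an sa8d-style 8x8 sum, both with DC terms masked out (mask_ac4 / mask_ac8). The sketch below is a simplified scalar analogue of that idea; it works on the source pixels directly, uses illustrative names, and does not reproduce the asm's exact masking, max-combining or scaling.

#include <stdint.h>
#include <stdlib.h>

/* Unnormalized 8-point Walsh-Hadamard butterfly network, in place. */
static void hadamard8( int v[8] )
{
    for( int step = 1; step < 8; step <<= 1 )
        for( int i = 0; i < 8; i += 2*step )
            for( int j = i; j < i+step; j++ )
            {
                int a = v[j], b = v[j+step];
                v[j]      = a + b;
                v[j+step] = a - b;
            }
}

/* Simplified hadamard_ac-style metric for one 8x8 block of source pixels:
 * low 32 bits  = abs-sum of the four 4x4 Hadamards, each block's DC excluded;
 * high 32 bits = abs-sum of the full 8x8 Hadamard, DC excluded. */
static uint64_t hadamard_ac_8x8_ref( const uint8_t *pix, int stride )
{
    int c[8][8], total = 0;
    uint32_t sum4 = 0, sum8 = 0;

    for( int i = 0; i < 8; i++ )
        for( int j = 0; j < 8; j++ )
        {
            c[i][j] = pix[i*stride+j];
            total += c[i][j];
        }

    /* four 4x4 sub-transforms */
    for( int by = 0; by < 8; by += 4 )
        for( int bx = 0; bx < 8; bx += 4 )
        {
            int t[4][4], dc = 0, s = 0;
            for( int i = 0; i < 4; i++ )
            {
                int a0 = c[by+i][bx+0]+c[by+i][bx+1], a1 = c[by+i][bx+0]-c[by+i][bx+1];
                int a2 = c[by+i][bx+2]+c[by+i][bx+3], a3 = c[by+i][bx+2]-c[by+i][bx+3];
                t[i][0] = a0+a2; t[i][2] = a0-a2;
                t[i][1] = a1+a3; t[i][3] = a1-a3;
                dc += c[by+i][bx+0]+c[by+i][bx+1]+c[by+i][bx+2]+c[by+i][bx+3];
            }
            for( int j = 0; j < 4; j++ )
            {
                int a0 = t[0][j]+t[1][j], a1 = t[0][j]-t[1][j];
                int a2 = t[2][j]+t[3][j], a3 = t[2][j]-t[3][j];
                s += abs(a0+a2) + abs(a0-a2) + abs(a1+a3) + abs(a1-a3);
            }
            sum4 += s - dc; /* the (0,0) coefficient equals the pixel sum */
        }

    /* full 8x8 transform: rows, then columns */
    for( int i = 0; i < 8; i++ )
        hadamard8( c[i] );
    for( int j = 0; j < 8; j++ )
    {
        int col[8];
        for( int i = 0; i < 8; i++ )
            col[i] = c[i][j];
        hadamard8( col );
        for( int i = 0; i < 8; i++ )
            sum8 += abs( col[i] );
    }
    sum8 -= total; /* drop the 8x8 DC, which equals the total pixel sum */

    return ((uint64_t)sum8 << 32) | sum4;
}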
%macro LOAD_INC_8x4W_SSE2 5
    movh      m%1, [r0]
    movh      m%2, [r0+r1]
    movh      m%3, [r0+r1*2]
    movh      m%4, [r0+r2]
%ifidn %1, 0
    lea       r0, [r0+r1*4]
%endif
    punpcklbw m%1, m%5
    punpcklbw m%2, m%5
    punpcklbw m%3, m%5
    punpcklbw m%4, m%5
%endmacro

%macro LOAD_INC_8x4W_SSSE3 5
    LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
%ifidn %1, 0
    lea       r0, [r0+r1*4]
%endif
    HSUMSUB %1, %2, %3, %4, %5
%endmacro

%macro HADAMARD_AC_SSE2 1
INIT_XMM
; in:  r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal x264_hadamard_ac_8x8_%1
%ifdef ARCH_X86_64
    %define spill0 m8
    %define spill1 m9
    %define spill2 m10
%else
    %define spill0 [rsp+gprsize]
    %define spill1 [rsp+gprsize+16]
    %define spill2 [rsp+gprsize+32]
%endif
%ifnidn %1, sse2 ; LOAD_INC loads sumsubs
    mova      m7, [hmul_8p GLOBAL]
%else            ; LOAD_INC only unpacks to words
    pxor      m7, m7
%endif
    LOAD_INC_8x4W 0, 1, 2, 3, 7
%ifidn %1, sse2
    HADAMARD4_2D_SSE 0, 1, 2, 3, 4
%else
    HADAMARD4_V m0, m1, m2, m3, m4
%endif
    mova  spill0, m1
    SWAP 1, 7
    LOAD_INC_8x4W 4, 5, 6, 7, 1
%ifidn %1, sse2
    HADAMARD4_2D_SSE 4, 5, 6, 7, 1
%else
    HADAMARD4_V m4, m5, m6, m7, m1
%endif
%ifnidn %1, sse2
    mova      m1, spill0
    mova      spill0, m6
    mova      spill1, m7
    HADAMARD 1, sumsub, 0, 1, 6, 7
    HADAMARD 1, sumsub, 2, 3, 6, 7
    mova      m6, spill0
    mova      m7, spill1
    mova      spill0, m1
    mova      spill1, m0
    HADAMARD 1, sumsub, 4, 5, 1, 0
    HADAMARD 1, sumsub, 6, 7, 1, 0
    mova      m0, spill1
%endif
    mova  spill1, m2
    mova  spill2, m3
    ABS_MOV   m1, m0
    ABS_MOV   m2, m4
    ABS_MOV   m3, m5
    paddw     m1, m2
    SUMSUB_BA m0, m4 ; m2
%ifnidn %1, sse2
    pand      m1, [mask_ac4b GLOBAL]
%else
    pand      m1, [mask_ac4 GLOBAL]
%endif
    ABS_MOV   m2, spill0
    paddw     m1, m3
    ABS_MOV   m3, spill1
    paddw     m1, m2
    ABS_MOV   m2, spill2
    paddw     m1, m3
    ABS_MOV   m3, m6
    paddw     m1, m2
    ABS_MOV   m2, m7
    paddw     m1, m3
    mova      m3, m7
    paddw     m1, m2
    mova      m2, m6
    psubw     m7, spill2
    paddw     m3, spill2
    mova [rsp+gprsize+32], m1 ; save satd
    mova      m1, m5
    psubw     m6, spill1
    paddw     m2, spill1
    psubw     m5, spill0
    paddw     m1, spill0
%ifnidn %1, sse2
    mova  spill1, m4
    HADAMARD 2, amax, 3, 7, 4
    HADAMARD 2, amax, 2, 6, 7, 4
    mova      m4, spill1
    HADAMARD 2, amax, 1, 5, 6, 7
    HADAMARD 2, sumsub, 0, 4, 5, 6
%else
    mova  spill1, m4
    HADAMARD 4, amax, 3, 7, 4
    HADAMARD 4, amax, 2, 6, 7, 4
    mova      m4, spill1
    HADAMARD 4, amax, 1, 5, 6, 7
    HADAMARD 4, sumsub, 0, 4, 5, 6
%endif
    paddw     m2, m3
    paddw     m2, m1
    paddw     m2, m2
    ABS1      m4, m7
    pand      m0, [mask_ac8 GLOBAL]
    ABS1      m0, m7
    paddw     m2, m4
    paddw     m0, m2
    mova [rsp+gprsize+16], m0 ; save sa8d
    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
    ret
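Callers see the packed result as a single 64-bit value (edx:eax on x86-32, rax on x86-64). A small usage sketch follows; the prototype is assumed from the wrapper code above rather than quoted from a header, so the stride type in particular is an assumption.

#include <stdint.h>

/* Assumed prototype (not part of this excerpt); packing per HADAMARD_AC_WXH_MMX:
 * low 32 bits = 4x4 (satd-style) AC sum, high 32 bits = 8x8 (sa8d-style) AC sum. */
uint64_t x264_pixel_hadamard_ac_16x16_mmxext( uint8_t *pix, int stride );

static void block_ac_energy( uint8_t *pix, int stride,
                             uint32_t *satd_ac, uint32_t *sa8d_ac )
{
    uint64_t v = x264_pixel_hadamard_ac_16x16_mmxext( pix, stride );
    *satd_ac = (uint32_t)v;         /* low half  */
    *sa8d_ac = (uint32_t)(v >> 32); /* high half */
}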