📄 pixel-a.asm
Font size:
;=============================================================================
; MMX/MMXEXT pixel comparison primitives (SAD / SSD / SATD).
; NASM syntax, x86-64. parm1q..parm5d / r10 / r11 are presumably
; ABI-abstraction macros for the argument registers, defined elsewhere
; in this project — TODO confirm against the common include file.
; Helper macros used but not defined here: SBUTTERFLYwd/dq, HADAMARD4x4,
; LOAD_DIFF_4P, SAD_INC_2x*P, SSD_INC_1x*P, SAD_X*_2x*P, SAD_X*_END, cglobal.
;=============================================================================

; 4x4 word transpose via two butterfly passes.
%macro TRANSPOSE4x4 5  ; abcd-t -> adtc
    SBUTTERFLYwd %1, %2, %5
    SBUTTERFLYwd %3, %4, %2
    SBUTTERFLYdq %1, %3, %4
    SBUTTERFLYdq %5, %2, %3
%endmacro

; Absolute value of packed words: %1 = |%1|, using %2 as scratch.
%macro MMX_ABS 2       ; mma, tmp
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
%endmacro

; Absolute value of two registers of packed words at once.
%macro MMX_ABS_TWO 4   ; mma, mmb, tmp0, tmp1
    pxor    %3, %3
    pxor    %4, %4
    psubw   %3, %1
    psubw   %4, %2
    pmaxsw  %1, %3
    pmaxsw  %2, %4
%endmacro

; 2D 4x4 Hadamard transform of mm4-mm7, then sum of absolute coefficients.
; NOTE(review): pavgw halves the accumulated sum (rounded average); the
; factor of 2 is presumably compensated by the transform scaling — confirm
; against the callers' expected SATD normalization.
%macro HADAMARD4x4_SUM 1  ; %1 = dest (row sum of one block)
    HADAMARD4x4  mm4, mm5, mm6, mm7
    TRANSPOSE4x4 mm4, mm5, mm6, mm7, %1
    HADAMARD4x4  mm4, mm7, %1, mm6
    MMX_ABS_TWO  mm4, mm7, mm3, mm5
    MMX_ABS_TWO  %1,  mm6, mm3, mm5
    paddw   %1, mm4
    paddw   mm6, mm7
    pavgw   %1, mm6
%endmacro

; Load one 4x4 block difference (pix1 - pix2) and produce its SATD.
; in: r10=3*stride1, r11=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: mm3..mm7
; out: %1 = satd
%macro LOAD_DIFF_HADAMARD_SUM 3
    LOAD_DIFF_4P mm4, mm3, [parm1q+%2],          [parm3q+%2]
    LOAD_DIFF_4P mm5, mm3, [parm1q+parm2q+%2],   [parm3q+parm4q+%2]
    LOAD_DIFF_4P mm6, mm3, [parm1q+2*parm2q+%2], [parm3q+2*parm4q+%2]
    LOAD_DIFF_4P mm7, mm3, [parm1q+r10+%2],      [parm3q+r11+%2]
%if %3
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
%endif
    HADAMARD4x4_SUM %1
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

; Clear the SAD accumulator.
%macro SAD_START 0
    pxor    mm0, mm0
%endmacro

; Return the accumulated SAD in eax.
%macro SAD_END 0
    movd    eax, mm0
    ret
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Instantiate one SAD function per block size: %1 = width, %2 = height.
%macro SAD 2
cglobal x264_pixel_sad_%1x%2_mmxext
    SAD_START
%rep %2/2
    SAD_INC_2x%1P
%endrep
    SAD_END
%endmacro

SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
SAD  4,  8
SAD  4,  4

;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                      uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
; Instantiate multi-reference SAD: %1 = number of refs (3 or 4),
; %2 = width, %3 = height. First row pair initializes, the rest accumulate.
%macro SAD_X 3
cglobal x264_pixel_sad_x%1_%2x%3_mmxext
    SAD_X%1_2x%2P 1
%rep %3/2-1
    SAD_X%1_2x%2P 0
%endrep
    SAD_X%1_END
%endmacro

SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4

; Partial distortion elimination: bail out early if the running SAD
; already exceeds the best score seen so far.
%macro PDE_CHECK 0
    movd    eax, mm0
    cmp     eax, parm5d    ; prev_score
    jl      .continue
    ret
ALIGN 4
.continue:
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
;-----------------------------------------------------------------------------
; SAD with an early-termination check at the halfway point.
%macro SAD_PDE 2
cglobal x264_pixel_sad_pde_%1x%2_mmxext
    SAD_START
%rep %2/4
    SAD_INC_2x%1P
%endrep
    movd    eax, mm0
    cmp     eax, parm5d    ; prev_score
    jl      .continue
    ret
ALIGN 4
.continue:
%rep %2/4
    SAD_INC_2x%1P
%endrep
    SAD_END
%endmacro

SAD_PDE 16, 16
SAD_PDE 16,  8
SAD_PDE  8, 16

%macro SSD_START 0
    pxor    mm7, mm7       ; zero
    pxor    mm0, mm0       ; mm0 holds the sum
%endmacro

; Fold the two dword partial sums and return in eax.
%macro SSD_END 0
    movq    mm1, mm0
    psrlq   mm1, 32
    paddd   mm0, mm1
    movd    eax, mm0
    ret
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Instantiate one SSD function per block size: %1 = width, %2 = height.
%macro SSD 2
cglobal x264_pixel_ssd_%1x%2_mmx
    SSD_START
%rep %2
    SSD_INC_1x%1P
%endrep
    SSD_END
%endmacro

SSD 16, 16
SSD 16,  8
SSD  8, 16
SSD  8,  8
SSD  8,  4
SSD  4,  8
SSD  4,  4

; Precompute 3*stride for the 4-row loads in LOAD_DIFF_HADAMARD_SUM.
%macro SATD_START 0
    lea     r10, [3*parm2q]  ; 3*stride1
    lea     r11, [3*parm4q]  ; 3*stride2
%endmacro

; Horizontal-add the four word sums in mm0 and return the low 16 bits.
%macro SATD_END 0
    pshufw  mm1, mm0, 01001110b
    paddw   mm0, mm1
    pshufw  mm1, mm0, 10110001b
    paddw   mm0, mm1
    movd    eax, mm0
    and     eax, 0xffff
    ret
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_4x4_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
    SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_4x8_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x4_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Loads are interleaved with adds to hide latency; each 4x4 sub-block's
; SATD is accumulated exactly once into mm0.
cglobal x264_pixel_satd_8x8_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
    paddw   mm0, mm2
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x8_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0,  0, 0
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  0, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw   mm0, mm2
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x16_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    paddw   mm1, mm2       ; deliberately folds into mm1; every sub-block
                           ; still reaches mm0 exactly once
    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    paddw   mm0, mm2
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Does not use SATD_END: the 16x16 sum can exceed 16 bits, so the final
; horizontal add widens words to dwords (punpcklwd) before summing.
cglobal x264_pixel_satd_16x16_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0,  0, 0
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  0, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  0, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  0, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
    paddw   mm0, mm2
    paddw   mm0, mm1
    pxor    mm3, mm3
    pshufw  mm1, mm0, 01001110b
    paddw   mm0, mm1
    punpcklwd mm0, mm3
    pshufw  mm1, mm0, 01001110b
    paddd   mm0, mm1
    movd    eax, mm0
    ret

; Hadamard-transform one 4x4 block of the fenc buffer (fixed FENC_STRIDE).
; in: parm1 = fenc
; out: mm0..mm3 = hadamard coefs
ALIGN 16
load_hadamard:
    pxor    mm7, mm7
    movd    mm0, [parm1q+0*FENC_STRIDE]
    movd    mm4, [parm1q+1*FENC_STRIDE]
    movd    mm3, [parm1q+2*FENC_STRIDE]
    movd    mm1, [parm1q+3*FENC_STRIDE]
    punpcklbw mm0, mm7
    punpcklbw mm4, mm7
    punpcklbw mm3, mm7
    punpcklbw mm1, mm7
    HADAMARD4x4  mm0, mm4, mm3, mm1
    TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2
    HADAMARD4x4  mm0, mm1, mm2, mm3
    ret

; Butterfly on scalar GPRs: (%1,%3) become sums, (%2,%4) become differences.
%macro SCALAR_SUMSUB 4
    add     %1, %2
    add     %3, %4
    add     %2, %2
    add     %4, %4
    sub     %2, %1
    sub     %4, %3
%endmacro

; Horizontal-add three MMX word accumulators in parallel, widening to
; dwords; %8 is the final combining op (e.g. paddd).
%macro SUM_MM_X3 8     ; 3x sum, 4x tmp, op
    pxor    %7, %7
    pshufw  %4, %1, 01001110b
    pshufw  %5, %2, 01001110b
    pshufw  %6, %3, 01001110b
    paddw   %1, %4
    paddw   %2, %5
    paddw   %3, %6
    punpcklwd %1, %7
    punpcklwd %2, %7
    punpcklwd %3, %7
    pshufw  %4, %1, 01001110b
    pshufw  %5, %2, 01001110b
    pshufw  %6, %3, 01001110b
    %8      %1, %4
    %8      %2, %5
    %8      %3, %6
%endmacro

;-----------------------------------------------------------------------------
⌨️ Keyboard shortcuts
Copy code
Ctrl + C
Search code
Ctrl + F
Full-screen mode
F11
Toggle theme
Ctrl + Shift + D
Show shortcuts
?
Increase font size
Ctrl + =
Decrease font size
Ctrl + -