; pixel-a.asm
;=============================================================================
; NOTE(review): this chunk was recovered from a viewer that collapsed the
; file onto a few physical lines; below is the identical token stream
; restored to one-instruction-per-line NASM layout.
;
; MMX/MMXEXT pixel-comparison primitives (SSD, SATD, SA8D, intra SATD)
; for 32-bit x86, cdecl calling convention: all arguments on the stack,
; integer return in eax.  Helper macros used but not visible in this chunk
; (SAD_INC_2x%1P, SAD_END, SSD_INC_1x%1P, LOAD_DIFF_HADAMARD_SUM,
; LOAD_DIFF_4P, SUMSUB_BADC, MMX_ABS, MMX_ABS_TWO, HADAMARD4x4,
; TRANSPOSE4x4, cglobal, FENC_STRIDE, FDEC_STRIDE) are presumably defined
; earlier in this file or in a common header -- confirm there.
;=============================================================================

; Tail of the SAD_PDE (SAD with partial distortion elimination) macro;
; the "%macro SAD_PDE" line and the early-termination test are above this
; chunk, so only the fall-through path is visible here.
.continue:
    mov     ebx, [esp+12]           ; reload stride1
%rep %2/4
    SAD_INC_2x%1P
%endrep
    SAD_END
%endmacro

; Instantiate the PDE SAD functions for the block sizes that use them.
SAD_PDE 16, 16
SAD_PDE 16 , 8
SAD_PDE 8, 16

; Common prologue for the SSD functions: loads the four cdecl arguments
; and clears the accumulators.  Pushing ebx first shifts the argument
; offsets by 4 (hence [esp+8] for the first argument).
%macro SSD_START 0
    push    ebx
    mov     eax, [esp+ 8]           ; pix1
    mov     ebx, [esp+12]           ; stride1
    mov     ecx, [esp+16]           ; pix2
    mov     edx, [esp+20]           ; stride2
    pxor    mm7, mm7                ; zero
    pxor    mm0, mm0                ; mm0 holds the sum
%endmacro

; Common epilogue for SSD: fold the two 32-bit partial sums in mm0 into
; one dword and return it in eax.
%macro SSD_END 0
    movq    mm1, mm0
    psrlq   mm1, 32                 ; high dword -> low
    paddd   mm0, mm1                ; low dword = total
    movd    eax, mm0
    pop     ebx
    ret
%endmacro

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Emits one SSD function per %1 x %2 block size; SSD_INC_1x%1P (defined
; elsewhere in this file) accumulates one row of squared differences.
%macro SSD 2
cglobal x264_pixel_ssd_%1x%2_mmx
    SSD_START
%rep %2
    SSD_INC_1x%1P
%endrep
    SSD_END
%endmacro

SSD 16, 16
SSD 16, 8
SSD 8, 16
SSD 8, 8
SSD 8, 4
SSD 4, 8
SSD 4, 4

; Common prologue for the SATD functions (same stack layout as SSD_START,
; but the MMX accumulators are set up by the first LOAD_DIFF_HADAMARD_SUM).
%macro SATD_START 0
    push    ebx
    mov     eax, [esp+ 8]           ; pix1
    mov     ebx, [esp+12]           ; stride1
    mov     ecx, [esp+16]           ; pix2
    mov     edx, [esp+20]           ; stride2
%endmacro

; Common epilogue for SATD: horizontal sum of the four 16-bit partial
; sums in mm0.  pshufw 01001110b swaps the two 32-bit halves; 10110001b
; swaps adjacent words; the result is replicated in every word, so mask
; the low 16 bits for the return value.
%macro SATD_END 0
    pshufw  mm1, mm0, 01001110b
    paddw   mm0, mm1
    pshufw  mm1, mm0, 10110001b
    paddw   mm0, mm1
    movd    eax, mm0
    and     eax, 0xffff             ; sum fits in 16 bits for blocks <= 8x16
    pop     ebx
    ret
%endmacro

; NOTE(review): LOAD_DIFF_HADAMARD_SUM's arguments appear to be
; (accumulator, x-offset, advance-pointers flag) -- confirm at its
; definition earlier in the file.

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_4x4_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
    SATD_END

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_4x8_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 1    ; top 4x4, advance to next row group
    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0    ; bottom 4x4
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x4_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0    ; left 4x4
    sub     eax, ebx                    ; rewind pix1 (undo the row advance)
    sub     ecx, edx                    ; rewind pix2
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0    ; right 4x4 (x offset 4)
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Processes the left 8-pixel-wide column of 4x4 blocks top-to-bottom,
; reloads the base pointers, then the right column.
cglobal x264_pixel_satd_8x8_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
    mov     eax, [esp+ 8]           ; pix1
    mov     ecx, [esp+16]           ; pix2
    LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
    paddw   mm0, mm2
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Four 4-pixel-wide columns (x offsets 0, 4, 8, 12), two 4x4 blocks each.
cglobal x264_pixel_satd_16x8_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
    mov     eax, [esp+ 8]           ; pix1
    mov     ecx, [esp+16]           ; pix2
    LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
    paddw   mm0, mm2
    mov     eax, [esp+ 8]           ; pix1
    mov     ecx, [esp+16]           ; pix2
    LOAD_DIFF_HADAMARD_SUM mm2, 8, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 8, 0
    paddw   mm0, mm2
    mov     eax, [esp+ 8]           ; pix1
    mov     ecx, [esp+16]           ; pix2
    LOAD_DIFF_HADAMARD_SUM mm2, 12, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
    paddw   mm0, mm2
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Two 4-pixel-wide columns, four 4x4 blocks each.
cglobal x264_pixel_satd_8x16_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
    LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
    LOAD_DIFF_HADAMARD_SUM mm2, 0, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
    paddw   mm0, mm2
    mov     eax, [esp+ 8]           ; pix1
    mov     ecx, [esp+16]           ; pix2
    LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
    paddw   mm0, mm2
    paddw   mm0, mm1
    SATD_END

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Four columns of four 4x4 blocks.  This one cannot use SATD_END: the
; total can exceed 16 bits, so the final reduction widens the word sums
; to dwords (punpcklwd with zero) before the last horizontal add.
cglobal x264_pixel_satd_16x16_mmxext
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
    LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
    LOAD_DIFF_HADAMARD_SUM mm2, 0, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
    paddw   mm0, mm2
    mov     eax, [esp+ 8]           ; pix1
    mov     ecx, [esp+16]           ; pix2
    LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
    paddw   mm0, mm2
    mov     eax, [esp+ 8]           ; pix1
    mov     ecx, [esp+16]           ; pix2
    LOAD_DIFF_HADAMARD_SUM mm2, 8, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 8, 1
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2, 8, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 8, 0
    paddw   mm0, mm2
    mov     eax, [esp+ 8]           ; pix1
    mov     ecx, [esp+16]           ; pix2
    LOAD_DIFF_HADAMARD_SUM mm2, 12, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw   mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2, 12, 1
    paddw   mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
    paddw   mm0, mm2
    paddw   mm0, mm1
    pxor    mm3, mm3                ; zero for widening
    pshufw  mm1, mm0, 01001110b     ; swap qword halves
    paddw   mm0, mm1                ; two words per partial sum
    punpcklwd mm0, mm3              ; widen words -> dwords (overflow safety)
    pshufw  mm1, mm0, 01001110b
    paddd   mm0, mm1                ; final dword sum
    movd    eax, mm0
    pop     ebx
    ret

; Load an 8x... wait -- loads eight 4-pixel difference rows (offset %1 = dx)
; into mm0..mm7, spilling mm6 because all eight registers are needed.
; Leaves eax/ecx advanced by 6 rows.  [spill] is defined by the caller.
%macro LOAD_DIFF_4x8P 1 ; dx
    LOAD_DIFF_4P mm0, mm7, %1, 0
    LOAD_DIFF_4P mm1, mm7, %1, 1
    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]
    LOAD_DIFF_4P mm2, mm7, %1, 0
    LOAD_DIFF_4P mm3, mm7, %1, 1
    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]
    LOAD_DIFF_4P mm4, mm7, %1, 0
    LOAD_DIFF_4P mm5, mm7, %1, 1
    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]
    LOAD_DIFF_4P mm6, mm7, %1, 0
    movq    [spill], mm6            ; free a scratch reg for the last load
    LOAD_DIFF_4P mm7, mm6, %1, 1
    movq    mm6, [spill]
%endmacro

; 8-point butterfly network: three rounds of SUMSUB_BADC over mm regs
; %1..%8 = one 1-D 8-point Hadamard transform.
%macro HADAMARD1x8 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%endmacro

; Sum of absolute values of all eight coefficient registers into mm0,
; spilling mm6/mm7 so MMX_ABS_TWO has scratch registers to work with.
%macro SUM4x8_MM 0
    movq    [spill], mm6
    movq    [spill+8], mm7
    MMX_ABS_TWO mm0, mm1, mm6, mm7
    MMX_ABS_TWO mm2, mm3, mm6, mm7
    paddw   mm0, mm2
    paddw   mm1, mm3
    movq    mm6, [spill]
    movq    mm7, [spill+8]
    MMX_ABS_TWO mm4, mm5, mm2, mm3
    MMX_ABS_TWO mm6, mm7, mm2, mm3
    paddw   mm4, mm6
    paddw   mm5, mm7
    paddw   mm0, mm4
    paddw   mm1, mm5
    paddw   mm0, mm1
%endmacro

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; 8x8 sum of absolute 2-D Hadamard-transformed differences.  Works in two
; 4x8 halves, transposing coefficients through a stack buffer.  Returns
; (sum+1)>>1 in eax and additionally leaves the pre-rounding sum in ecx
; for the 16x16 wrapper below (a deliberate ABI extension).
cglobal x264_pixel_sa8d_8x8_mmxext
    SATD_START
    sub     esp, 0x70
%define args  esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
    LOAD_DIFF_4x8P 0
    HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
    movq    [spill], mm0
    TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
    movq    [trans+0x00], mm4
    movq    [trans+0x08], mm7
    movq    [trans+0x10], mm0
    movq    [trans+0x18], mm6
    movq    mm0, [spill]
    TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
    movq    [trans+0x20], mm0
    movq    [trans+0x28], mm3
    movq    [trans+0x30], mm4
    movq    [trans+0x38], mm2
    mov     eax, [args+4]           ; reload pix1 (stride1 arg slot; see SATD_START offsets)
    mov     ecx, [args+12]          ; reload pix2
    LOAD_DIFF_4x8P 4
    HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
    movq    [spill], mm7
    TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7
    movq    [trans+0x40], mm0
    movq    [trans+0x48], mm3
    movq    [trans+0x50], mm7
    movq    [trans+0x58], mm2
    movq    mm7, [spill]
    TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
    movq    mm5, [trans+0x00]
    movq    mm1, [trans+0x08]
    movq    mm2, [trans+0x10]
    movq    mm3, [trans+0x18]
    HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
    SUM4x8_MM
    movq    [trans], mm0            ; park first half's sum
    movq    mm0, [trans+0x20]
    movq    mm1, [trans+0x28]
    movq    mm2, [trans+0x30]
    movq    mm3, [trans+0x38]
    movq    mm4, [trans+0x40]
    movq    mm5, [trans+0x48]
    movq    mm6, [trans+0x50]
    movq    mm7, [trans+0x58]
    HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
    SUM4x8_MM
    pavgw   mm0, [esp]              ; (half1+half2+1)>>1 per word; [esp]==[trans]
    pshufw  mm1, mm0, 01001110b
    paddw   mm0, mm1
    pshufw  mm1, mm0, 10110001b
    paddw   mm0, mm1
    movd    eax, mm0
    and     eax, 0xffff
    mov     ecx, eax ; preserve rounding for 16x16
    add     eax, 1
    shr     eax, 1
    add     esp, 0x70
    pop     ebx
    ret
%undef args
%undef spill
%undef trans

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
; Sums four 8x8 sa8d calls via the un-rounded result each call leaves in
; ecx, rounding only once at the end.  Builds a private 4-arg frame and
; patches pix1/pix2 in place between calls.
cglobal x264_pixel_sa8d_16x16_mmxext
    push    esi
    push    edi
    push    ebp
    mov     esi, [esp+28]           ; stride2
    mov     edi, [esp+20]           ; stride1
    push    esi
    push    dword [esp+28]          ; pix2
    push    edi
    push    dword [esp+28]          ; pix1
    call    x264_pixel_sa8d_8x8_mmxext
    mov     ebp, ecx                ; accumulate un-rounded sums in ebp
    shl     edi, 3                  ; 8*stride1
    shl     esi, 3                  ; 8*stride2
    add     [esp+0], edi            ; pix1+8*stride1
    add     [esp+8], esi            ; pix2+8*stride2
    call    x264_pixel_sa8d_8x8_mmxext
    add     ebp, ecx
    add     dword [esp+0], 8        ; pix1+8*stride1+8
    add     dword [esp+8], 8        ; pix2+8*stride2+8
    call    x264_pixel_sa8d_8x8_mmxext
    add     ebp, ecx
    sub     [esp+0], edi            ; pix1+8
    sub     [esp+8], esi            ; pix2+8
    call    x264_pixel_sa8d_8x8_mmxext
    lea     eax, [ebp+ecx+1]        ; total, rounded once
    shr     eax, 1
    add     esp, 16                 ; drop the private argument frame
    pop     ebp
    pop     edi
    pop     esi
    ret

; in: fenc
; out: mm0..mm3 = hadamard coefs
; Loads a 4x4 block of encoder pixels and applies a full 2-D 4x4 Hadamard
; (row transform, transpose, column transform).
%macro LOAD_HADAMARD 1
    pxor    mm7, mm7                ; zero for byte->word widening
    movd    mm0, [%1+0*FENC_STRIDE]
    movd    mm4, [%1+1*FENC_STRIDE]
    movd    mm3, [%1+2*FENC_STRIDE]
    movd    mm1, [%1+3*FENC_STRIDE]
    punpcklbw mm0, mm7
    punpcklbw mm4, mm7
    punpcklbw mm3, mm7
    punpcklbw mm1, mm7
    HADAMARD4x4 mm0, mm4, mm3, mm1
    TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2
    HADAMARD4x4 mm0, mm1, mm2, mm3
%endmacro

; 2-point butterfly on two pairs of GPRs: (%1,%2) <- (%1+%2, %2-%1-old?)
; Concretely: %1 += %2 then %2 = 2*%2 - %1 = old(%2) - old(%1); same for
; (%3,%4).  Used to build 1-D 4-point Hadamards on scalar edge pixels.
%macro SCALAR_SUMSUB 4
    add     %1, %2
    add     %3, %4
    add     %2, %2
    add     %4, %4
    sub     %2, %1
    sub     %4, %3
%endmacro

; Horizontal-reduce three word accumulators (%1..%3) in parallel using
; temps %4..%7; after a saturating word add, widens to dwords and applies
; op %8 (e.g. pavgw/paddd) for the final fold.
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pxor    %7, %7
    pshufw  %4, %1, 01001110b
    pshufw  %5, %2, 01001110b
    pshufw  %6, %3, 01001110b
    paddusw %1, %4
    paddusw %2, %5
    paddusw %3, %6
    punpcklwd %1, %7
    punpcklwd %2, %7
    punpcklwd %3, %7
    pshufw  %4, %1, 01001110b
    pshufw  %5, %2, 01001110b
    pshufw  %6, %3, 01001110b
    %8      %1, %4
    %8      %2, %5
    %8      %3, %6
%endmacro

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
; Computes SATD cost of the V, H and DC intra predictions for one 4x4
; block in a single pass: the prediction's Hadamard coefficients are
; derived from 1-D transforms of the top/left edge pixels instead of
; materializing the predicted blocks.  res[0]=V, res[1]=H, res[2]=DC.
cglobal x264_intra_satd_x3_4x4_mmxext
    push    ebx
    push    edi
    push    esi
    sub     esp, 16
%define args    esp+32
%define top_1d  esp+8 ; +8
%define left_1d esp+0 ; +8
    mov     eax, [args+0]           ; fenc
    LOAD_HADAMARD eax               ; mm0..mm3 = source coefs
    mov     edi, [args+4]           ; fdec
    movzx   eax, byte [edi-1+0*FDEC_STRIDE]
    movzx   ebx, byte [edi-1+1*FDEC_STRIDE]
    movzx   ecx, byte [edi-1+2*FDEC_STRIDE]
    movzx   edx, byte [edi-1+3*FDEC_STRIDE]
    SCALAR_SUMSUB eax, ebx, ecx, edx
    SCALAR_SUMSUB eax, ecx, ebx, edx ; 1x4 hadamard
    mov     [left_1d+0], ax
    mov     [left_1d+2], bx
    mov     [left_1d+4], cx
    mov     [left_1d+6], dx
    mov     esi, eax                ; dc
    movzx   eax, byte [edi-FDEC_STRIDE+0]
    movzx   ebx, byte [edi-FDEC_STRIDE+1]
    movzx   ecx, byte [edi-FDEC_STRIDE+2]
    movzx   edx, byte [edi-FDEC_STRIDE+3]
    SCALAR_SUMSUB eax, ebx, ecx, edx
    SCALAR_SUMSUB eax, ecx, ebx, edx ; 4x1 hadamard
    mov     [top_1d+0], ax
    mov     [top_1d+2], bx
    mov     [top_1d+4], cx
    mov     [top_1d+6], dx
    lea     esi, [esi + eax + 4]    ; dc
    and     esi, -8                 ; round dc to multiple of 8
    shl     esi, 1                  ; scale to match transform magnitude
    movq    mm4, mm1
    movq    mm5, mm2
    MMX_ABS_TWO mm4, mm5, mm6, mm7
    movq    mm7, mm3
    paddw   mm4, mm5
    MMX_ABS mm7, mm6
    paddw   mm7, mm4 ; 3x4 sum
    movq    mm4, [left_1d]
    movd    mm5, esi
    psllw   mm4, 2
    psubw   mm4, mm0
    psubw   mm5, mm0
    punpcklwd mm0, mm1
    punpcklwd mm2, mm3
    punpckldq mm0, mm2 ; transpose
    movq    mm1, [top_1d]
    psllw   mm1, 2
    psubw   mm0, mm1
    MMX_ABS mm4, mm3 ; 1x4 sum
    MMX_ABS mm5, mm2 ; 1x4 sum
    MMX_ABS mm0, mm1 ; 4x1 sum
    paddw   mm4, mm7
    paddw   mm5, mm7
    movq    mm1, mm5
    psrlq   mm1, 16 ; 4x3 sum
    paddw   mm0, mm1
    SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
    mov     eax, [args+8]           ; res
    movd    [eax+0], mm0 ; i4x4_v satd
    movd    [eax+4], mm4 ; i4x4_h satd
    movd    [eax+8], mm5 ; i4x4_dc satd
    add     esp, 16
    pop     esi
    pop     edi
    pop     ebx
    ret

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
; Same idea as the 4x4 version scaled to 16x16: first gathers 1-D
; Hadamards of all four 4-pixel groups of the left and top edges.
; NOTE(review): this function continues past the end of this chunk; only
; the visible portion is shown/annotated here.
cglobal x264_intra_satd_x3_16x16_mmxext
    push    ebx
    push    ebp
    push    edi
    push    esi
    sub     esp, 88
%define args    esp+108
%define sums    esp+64 ; +24
%define top_1d  esp+32 ; +32
%define left_1d esp+0  ; +32
    pxor    mm0, mm0
    movq    [sums+0], mm0
    movq    [sums+8], mm0
    movq    [sums+16], mm0

    ; 1D hadamards
    mov     edi, [args+4]           ; fdec
    xor     ebp, ebp                ; ebp accumulates the dc sum
    mov     esi, 12                 ; edge group index: 12, 8, 4, 0
.loop_edge:
    ; left
    shl     esi, 5 ; log(FDEC_STRIDE)
    movzx   eax, byte [edi+esi-1+0*FDEC_STRIDE]
    movzx   ebx, byte [edi+esi-1+1*FDEC_STRIDE]
    movzx   ecx, byte [edi+esi-1+2*FDEC_STRIDE]
    movzx   edx, byte [edi+esi-1+3*FDEC_STRIDE]
    shr     esi, 5
    SCALAR_SUMSUB eax, ebx, ecx, edx
    SCALAR_SUMSUB eax, ecx, ebx, edx
    add     ebp, eax
    mov     [left_1d+2*esi+0], ax
    mov     [left_1d+2*esi+2], bx
    mov     [left_1d+2*esi+4], cx
    mov     [left_1d+2*esi+6], dx
    ; top
    movzx   eax, byte [edi+esi-FDEC_STRIDE+0]
    movzx   ebx, byte [edi+esi-FDEC_STRIDE+1]
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -