📄 pixel-a.asm
字号:
HADAMARD_AC_WXH_SSE2 16, 16, %1HADAMARD_AC_WXH_SSE2 8, 16, %1HADAMARD_AC_WXH_SSE2 16, 8, %1HADAMARD_AC_WXH_SSE2 8, 8, %1%endmacro ; HADAMARD_AC_SSE2; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )%macro HADAMARD_AC_WXH_SSE2 3cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11 %assign pad 16-gprsize-(stack_offset&15) %define ysub r1 sub rsp, 48+pad lea r2, [r1*3] call x264_hadamard_ac_8x8_%3%if %2==16 %define ysub r2 lea r0, [r0+r1*4] sub rsp, 32 call x264_hadamard_ac_8x8_%3%endif%if %1==16 neg ysub sub rsp, 32 lea r0, [r0+ysub*4+8] neg ysub call x264_hadamard_ac_8x8_%3%if %2==16 lea r0, [r0+r1*4] sub rsp, 32 call x264_hadamard_ac_8x8_%3%endif%endif mova m1, [rsp+0x20]%if %1*%2 >= 128 paddusw m0, [rsp+0x30] paddusw m1, [rsp+0x40]%endif%if %1*%2 == 256 paddusw m0, [rsp+0x50] paddusw m1, [rsp+0x60] paddusw m0, [rsp+0x70] paddusw m1, [rsp+0x80] psrlw m0, 1%endif HADDW m0, m2 HADDW m1, m3 movd edx, m0 movd eax, m1 shr edx, 2 - (%1*%2 >> 8) shr eax, 1%ifdef ARCH_X86_64 shl rdx, 32 add rax, rdx%endif add rsp, 16+%1*%2/2+pad RET%endmacro ; HADAMARD_AC_WXH_SSE2; instantiate satds%ifndef ARCH_X86_64cextern x264_pixel_sa8d_8x8_internal_mmxextSA8D mmxext%endif%define TRANS TRANS_SSE2%define ABS1 ABS1_MMX%define ABS2 ABS2_MMX%define DIFFOP DIFF_UNPACK_SSE2%define JDUP JDUP_SSE2%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size%define movdqu movups%define punpcklqdq movlhpsINIT_XMMSA8D sse2SATDS_SSE2 sse2INTRA_SA8D_SSE2 sse2INTRA_SATDS_MMX mmxextHADAMARD_AC_SSE2 sse2%define ABS1 ABS1_SSSE3%define ABS2 ABS2_SSSE3%define ABS_MOV ABS_MOV_SSSE3%define DIFFOP DIFF_SUMSUB_SSSE3%define JDUP JDUP_CONROE%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3SATDS_SSE2 ssse3SA8D ssse3HADAMARD_AC_SSE2 ssse3%undef movdqa ; nehalem doesn't like movaps%undef movdqu ; movups%undef punpcklqdq ; or movlhpsINTRA_SA8D_SSE2 ssse3INTRA_SATDS_MMX ssse3%define TRANS TRANS_SSE4%define JDUP JDUP_PENRYN%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYNSATDS_SSE2 sse4SA8D sse4HADAMARD_AC_SSE2 sse4;=============================================================================; SSIM;=============================================================================;-----------------------------------------------------------------------------; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,; const uint8_t *pix2, int stride2, int sums[2][4] );-----------------------------------------------------------------------------%macro SSIM_ITER 1 movq m5, [r0+(%1&1)*r1] movq m6, [r2+(%1&1)*r3] punpcklbw m5, m0 punpcklbw m6, m0%if %1==1 lea r0, [r0+r1*2] lea r2, [r2+r3*2]%endif%if %1==0 movdqa m1, m5 movdqa m2, m6%else paddw m1, m5 paddw m2, m6%endif movdqa m7, m5 pmaddwd m5, m5 pmaddwd m7, m6 pmaddwd m6, m6%if %1==0 SWAP m3, m5 SWAP m4, m7%else paddd m3, m5 paddd m4, m7%endif paddd m3, m6%endmacrocglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8 pxor m0, m0 SSIM_ITER 0 SSIM_ITER 1 SSIM_ITER 2 SSIM_ITER 3 ; PHADDW m1, m2 ; PHADDD m3, m4 movdqa m7, [pw_1 GLOBAL] pshufd m5, m3, 0xb1 pmaddwd m1, m7 pmaddwd m2, m7 pshufd m6, m4, 0xb1 packssdw m1, m2 paddd m3, m5 pshufd m1, m1, 0xd8 paddd m4, m6 pmaddwd m1, m7 movdqa m5, m3 punpckldq m3, m4 punpckhdq m5, m4%ifdef UNIX64 %define t0 r4%else %define t0 rax mov t0, r4mp%endif movq [t0+ 0], m1 movq [t0+ 8], m3 movhps [t0+16], m1 movq [t0+24], m5 RET;-----------------------------------------------------------------------------; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width );-----------------------------------------------------------------------------cglobal x264_pixel_ssim_end4_sse2, 3,3,7 movdqa m0, [r0+ 0] movdqa m1, [r0+16] movdqa m2, [r0+32] movdqa m3, [r0+48] movdqa m4, [r0+64] paddd m0, [r1+ 0] paddd m1, [r1+16] paddd m2, [r1+32] paddd m3, [r1+48] paddd m4, [r1+64] paddd m0, m1 paddd m1, m2 paddd m2, m3 paddd m3, m4 movdqa m5, [ssim_c1 GLOBAL] movdqa m6, [ssim_c2 GLOBAL] TRANSPOSE4x4D 0, 1, 2, 3, 4; s1=m0, s2=m1, ss=m2, s12=m3 movdqa m4, m1 pslld m1, 16 pmaddwd m4, m0 ; s1*s2 por m0, m1 pmaddwd m0, m0 ; s1*s1 + s2*s2 pslld m4, 1 pslld m3, 7 pslld m2, 6 psubd m3, m4 ; covar*2 psubd m2, m0 ; vars paddd m0, m5 paddd m4, m5 paddd m3, m6 paddd m2, m6 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1) cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) cvtdq2ps m2, m2 ; (float)(vars + ssim_c2) mulps m4, m3 mulps m0, m2 divps m4, m0 ; ssim cmp r2d, 4 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level neg r2%ifdef PIC lea r3, [mask_ff + 16 GLOBAL] movdqu m1, [r3 + r2*4]%else movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]%endif pand m4, m1.skip: movhlps m0, m4 addps m0, m4 pshuflw m4, m0, 0xE addss m0, m4%ifndef ARCH_X86_64 movd r0m, m0 fld dword r0m%endif RET;=============================================================================; Successive Elimination ADS;=============================================================================%macro ADS_START 1 ; unroll_size%ifdef ARCH_X86_64 %define t0 r6%ifdef WIN64 mov r4, r4mp movsxd r5, dword r5m%endif mov r10, rsp%else %define t0 r4 mov rbp, rsp%endif mov r0d, r5m sub rsp, r0 sub rsp, %1*4-1 and rsp, ~15 mov t0, rsp shl r2d, 1%endmacro%macro ADS_END 1 add r1, 8*%1 add r3, 8*%1 add t0, 4*%1 sub r0d, 4*%1 jg .loop%ifdef WIN64 RESTORE_XMM r10%endif jmp ads_mvs%endmacro%define ABS1 ABS1_MMX;-----------------------------------------------------------------------------; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );-----------------------------------------------------------------------------cglobal x264_pixel_ads4_mmxext, 4,7 movq mm6, [r0] movq mm4, [r0+8] pshufw mm7, mm6, 0 pshufw mm6, mm6, 0xAA pshufw mm5, mm4, 0 pshufw mm4, mm4, 0xAA ADS_START 1.loop: movq mm0, [r1] movq mm1, [r1+16] psubw mm0, mm7 psubw mm1, mm6 ABS1 mm0, mm2 ABS1 mm1, mm3 movq mm2, [r1+r2] movq mm3, [r1+r2+16] psubw mm2, mm5 psubw mm3, mm4 paddw mm0, mm1 ABS1 mm2, mm1 ABS1 mm3, mm1 paddw mm0, mm2 paddw mm0, mm3%ifdef WIN64 pshufw mm1, [r10+stack_offset+56], 0%elifdef ARCH_X86_64 pshufw mm1, [r10+8], 0%else pshufw mm1, [ebp+stack_offset+28], 0%endif paddusw mm0, [r3] psubusw mm1, mm0 packsswb mm1, mm1 movd [t0], mm1 ADS_END 1cglobal x264_pixel_ads2_mmxext, 4,7 movq mm6, [r0] pshufw mm5, r6m, 0 pshufw mm7, mm6, 0 pshufw mm6, mm6, 0xAA ADS_START 1.loop: movq mm0, [r1] movq mm1, [r1+r2] psubw mm0, mm7 psubw mm1, mm6 ABS1 mm0, mm2 ABS1 mm1, mm3 paddw mm0, mm1 paddusw mm0, [r3] movq mm4, mm5 psubusw mm4, mm0 packsswb mm4, mm4 movd [t0], mm4 ADS_END 1cglobal x264_pixel_ads1_mmxext, 4,7 pshufw mm7, [r0], 0 pshufw mm6, r6m, 0 ADS_START 2.loop: movq mm0, [r1] movq mm1, [r1+8] psubw mm0, mm7 psubw mm1, mm7 ABS1 mm0, mm2 ABS1 mm1, mm3 paddusw mm0, [r3] paddusw mm1, [r3+8] movq mm4, mm6 movq mm5, mm6 psubusw mm4, mm0 psubusw mm5, mm1 packsswb mm4, mm5 movq [t0], mm4 ADS_END 2%macro ADS_SSE2 1cglobal x264_pixel_ads4_%1, 4,7,12 movdqa xmm4, [r0] pshuflw xmm7, xmm4, 0 pshuflw xmm6, xmm4, 0xAA pshufhw xmm5, xmm4, 0 pshufhw xmm4, xmm4, 0xAA punpcklqdq xmm7, xmm7 punpcklqdq xmm6, xmm6 punpckhqdq xmm5, xmm5 punpckhqdq xmm4, xmm4%ifdef ARCH_X86_64 pshuflw xmm8, r6m, 0 punpcklqdq xmm8, xmm8 ADS_START 2 movdqu xmm10, [r1] movdqu xmm11, [r1+r2].loop: movdqa xmm0, xmm10 movdqu xmm1, [r1+16] movdqa xmm10, xmm1 psubw xmm0, xmm7 psubw xmm1, xmm6 ABS1 xmm0, xmm2 ABS1 xmm1, xmm3 movdqa xmm2, xmm11 movdqu xmm3, [r1+r2+16] movdqa xmm11, xmm3 psubw xmm2, xmm5 psubw xmm3, xmm4 paddw xmm0, xmm1 movdqu xmm9, [r3] ABS1 xmm2, xmm1 ABS1 xmm3, xmm1 paddw xmm0, xmm2 paddw xmm0, xmm3 paddusw xmm0, xmm9 movdqa xmm1, xmm8 psubusw xmm1, xmm0 packsswb xmm1, xmm1 movq [t0], xmm1%else ADS_START 2.loop: movdqu xmm0, [r1] movdqu xmm1, [r1+16] psubw xmm0, xmm7 psubw xmm1, xmm6 ABS1 xmm0, xmm2 ABS1 xmm1, xmm3 movdqu xmm2, [r1+r2] movdqu xmm3, [r1+r2+16] psubw xmm2, xmm5 psubw xmm3, xmm4 paddw xmm0, xmm1 ABS1 xmm2, xmm1 ABS1 xmm3, xmm1 paddw xmm0, xmm2 paddw xmm0, xmm3 movd xmm1, [ebp+stack_offset+28] movdqu xmm2, [r3] pshuflw xmm1, xmm1, 0 punpcklqdq xmm1, xmm1 paddusw xmm0, xmm2 psubusw xmm1, xmm0 packsswb xmm1, xmm1 movq [t0], xmm1%endif ; ARCH ADS_END 2cglobal x264_pixel_ads2_%1, 4,7,8 movq xmm6, [r0] movd xmm5, r6m pshuflw xmm7, xmm6, 0 pshuflw xmm6, xmm6, 0xAA pshuflw xmm5, xmm5, 0 punpcklqdq xmm7, xmm7 punpcklqdq xmm6, xmm6 punpcklqdq xmm5, xmm5 ADS_START 2.loop: movdqu xmm0, [r1] movdqu xmm1, [r1+r2] psubw xmm0, xmm7 psubw xmm1, xmm6 movdqu xmm4, [r3] ABS1 xmm0, xmm2 ABS1 xmm1, xmm3 paddw xmm0, xmm1 paddusw xmm0, xmm4 movdqa xmm1, xmm5 psubusw xmm1, xmm0 packsswb xmm1, xmm1 movq [t0], xmm1 ADS_END 2cglobal x264_pixel_ads1_%1, 4,7,8 movd xmm7, [r0] movd xmm6, r6m pshuflw xmm7, xmm7, 0 pshuflw xmm6, xmm6, 0 punpcklqdq xmm7, xmm7 punpcklqdq xmm6, xmm6 ADS_START 4.loop: movdqu xmm0, [r1] movdqu xmm1, [r1+16] psubw xmm0, xmm7 psubw xmm1, xmm7 movdqu xmm2, [r3] movdqu xmm3, [r3+16] ABS1 xmm0, xmm4 ABS1 xmm1, xmm5 paddusw xmm0, xmm2 paddusw xmm1, xmm3 movdqa xmm4, xmm6 movdqa xmm5, xmm6 psubusw xmm4, xmm0 psubusw xmm5, xmm1 packsswb xmm4, xmm5 movdqa [t0], xmm4 ADS_END 4%endmacroADS_SSE2 sse2%define ABS1 ABS1_SSSE3ADS_SSE2 ssse3; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ); {; int nmv=0, i, j;; *(uint32_t*)(masks+width) = 0;; for( i=0; i<width; i+=8 ); {; uint64_t mask = *(uint64_t*)(masks+i);; if( !mask ) continue;; for( j=0; j<8; j++ ); if( mask & (255<<j*8) ); mvs[nmv++] = i+j;; }; return nmv;; }cglobal x264_pixel_ads_mvs, 0,7,0ads_mvs:%ifdef ARCH_X86_64 ; mvs = r4 ; masks = rsp ; width = r5 ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)%ifdef WIN64 mov r8, r4 mov r9, r5%endif xor eax, eax xor esi, esi mov dword [rsp+r9], 0 jmp .loopi.loopi0: add esi, 8 cmp esi, r9d jge .end.loopi: mov rdi, [rsp+rsi] test rdi, rdi jz .loopi0 xor ecx, ecx%macro TEST 1 mov [r8+rax*2], si test edi, 0xff<<(%1*8) setne cl add eax, ecx inc esi%endmacro TEST 0 TEST 1 TEST 2 TEST 3 shr rdi, 32 TEST 0 TEST 1 TEST 2 TEST 3 cmp esi, r9d jl .loopi.end: mov rsp, r10 RET%else xor eax, eax xor esi, esi mov ebx, [ebp+stack_offset+20] ; mvs mov edi, [ebp+stack_offset+24] ; width mov dword [esp+edi], 0 push ebp jmp .loopi.loopi0: add esi, 8 cmp esi, edi jge .end.loopi: mov ebp, [esp+esi+4] mov edx, [esp+esi+8] mov ecx, ebp or ecx, edx jz .loopi0 xor ecx, ecx%macro TEST 2 mov [ebx+eax*2], si test %2, 0xff<<(%1*8) setne cl add eax, ecx inc esi%endmacro TEST 0, ebp TEST 1, ebp TEST 2, ebp TEST 3, ebp TEST 0, edx TEST 1, edx TEST 2, edx TEST 3, edx cmp esi, edi jl .loopi.end: pop esp RET%endif ; ARCH
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -