; pixel-sse2.asm — SSE2/SSSE3 pixel comparison primitives (SSD, SATD, SSIM)
; x86-32, NASM syntax, cdecl ABI. Chunk of a larger file; helper macros
; (cglobal, SSD_START_SSE2, SSD_INC_2x16P_SSE2, HADDW, picgetgot, picpush,
; picesp, GOT_ebx, pw_1, ssim_c1, ssim_c2, mask_ff) are defined elsewhere.
%endmacro ; closes the macro begun above this chunk (presumably SSD_INC_2x16P_SSE2 — TODO confirm against full file)

; Reduce the four dword partial sums in xmm0 to one dword, return it in eax,
; and emit the shared SSD epilogue (pop ebx / ret).
%macro SSD_END_SSE2 0
    movdqa  xmm1, xmm0
    psrldq  xmm1, 8
    paddd   xmm0, xmm1          ; fold high qword into low qword
    movdqa  xmm1, xmm0
    psrldq  xmm1, 4
    paddd   xmm0, xmm1          ; fold remaining dword pair
    movd    eax, xmm0           ; return value
    pop     ebx
    ret
%endmacro

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x16_sse2
    SSD_START_SSE2
%rep 8
    SSD_INC_2x16P_SSE2          ; 2 rows per iteration -> 16 rows total
%endrep
    SSD_END_SSE2

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x8_sse2
    SSD_START_SSE2
%rep 4
    SSD_INC_2x16P_SSE2          ; 2 rows per iteration -> 8 rows total
%endrep
    SSD_END_SSE2

; Temp-free butterfly on packed words:
;   %1 = %1 + %2,  %2 = old %2 - old %1   (and likewise %3/%4).
%macro SUMSUB_BADC 4
    paddw   %1, %2
    paddw   %3, %4
    paddw   %2, %2
    paddw   %4, %4
    psubw   %2, %1              ; 2*%2 - (%1+%2) = old %2 - old %1
    psubw   %4, %3
%endmacro

; One 4-point Hadamard transform stage over four word-vector registers.
%macro HADAMARD1x4 4
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4
%endmacro

; %3/%5 = interleave of %3 and %4 (low/high halves); %1 = mov size suffix,
; %2 = punpck granularity suffix.
%macro SBUTTERFLY 5
    mov%1       %5, %3
    punpckl%2   %3, %4
    punpckh%2   %5, %4
%endmacro

%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers
    mov%1       %5, %3
    punpckh%2   %3, %4
    punpckl%2   %5, %4
%endmacro

%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
    SBUTTERFLY dqa, dq,  %1, %2, %5
    SBUTTERFLY dqa, dq,  %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3
%endmacro

%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
    SBUTTERFLY  dqa, wd,  %1, %2, %5
    SBUTTERFLY  dqa, wd,  %3, %4, %2
    SBUTTERFLY  dqa, dq,  %1, %3, %4
    SBUTTERFLY2 dqa, dq,  %5, %2, %3
    SBUTTERFLY  dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3
%endmacro

; %1 = words of pix1[i]-pix2[i] for 8 pixels. The double-punpck makes the
; high byte of each word identical in %1 and %2, so psubw yields the
; byte difference sign-extended to words without needing a zero register.
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
    movq        %1, %3
    movq        %2, %4
    punpcklbw   %1, %2
    punpcklbw   %2, %2
    psubw       %1, %2
%endmacro

; Accumulate abs(%1)+abs(%2) into %4 (saturating); %3 is scratch.
%macro SUM4x4_SSE2 4 ; 02 13 junk sum
    pxor    %3, %3
    psubw   %3, %1
    pmaxsw  %1, %3              ; %1 = abs(%1) via max(x, -x)
    pxor    %3, %3
    psubw   %3, %2
    pmaxsw  %2, %3              ; %2 = abs(%2)
    paddusw %4, %1
    paddusw %4, %2
%endmacro

; Two interleaved SUM4x4s: accumulates abs of %1,%2,%4,%5 into %7.
%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
    pxor    %3, %3
    pxor    %6, %6
    psubw   %3, %1
    psubw   %6, %4
    pmaxsw  %1, %3
    pmaxsw  %4, %6
    pxor    %3, %3
    pxor    %6, %6
    psubw   %3, %2
    psubw   %6, %5
    pmaxsw  %2, %3
    pmaxsw  %5, %6
    paddusw %1, %2
    paddusw %4, %5
    paddusw %7, %1
    paddusw %7, %4
%endmacro

; Same contract as SUM8x4_SSE2, using SSSE3 pabsw (scratch args unused).
%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . sum
    pabsw   %1, %1
    pabsw   %2, %2
    pabsw   %4, %4
    pabsw   %5, %5
    paddusw %1, %2
    paddusw %4, %5
    paddusw %7, %1
    paddusw %7, %4
%endmacro

; SATD of two 8x4 tiles (8 wide, 4 tall, processed as 2x(8x2) row pairs):
; load differences, 4-point Hadamard on rows, transpose, Hadamard on
; columns, accumulate |coeffs| into xmm6. Advances eax/ecx by 4 rows.
%macro SATD_TWO_SSE2 0
    LOAD_DIFF_8P    xmm0, xmm4, [eax], [ecx]
    LOAD_DIFF_8P    xmm1, xmm5, [eax+ebx], [ecx+edx]
    lea             eax, [eax+2*ebx]
    lea             ecx, [ecx+2*edx]
    LOAD_DIFF_8P    xmm2, xmm4, [eax], [ecx]
    LOAD_DIFF_8P    xmm3, xmm5, [eax+ebx], [ecx+edx]
    lea             eax, [eax+2*ebx]
    lea             ecx, [ecx+2*edx]
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    SUM8x4          xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
%endmacro

; Shared SATD prologue: load cdecl args, zero the accumulator.
%macro SATD_START 0
    push    ebx
    mov     eax, [esp+ 8]       ; pix1
    mov     ebx, [esp+12]       ; stride1
    mov     ecx, [esp+16]       ; pix2
    mov     edx, [esp+20]       ; stride2
    pxor    xmm6, xmm6          ; running SATD accumulator
%endmacro

; Shared SATD epilogue: halve, horizontally add, return in eax.
%macro SATD_END 0
    ; each column sum of SATD is necessarily even, so we don't lose any
    ; precision by shifting first.
    psrlw   xmm6, 1
    HADDW   xmm6, xmm7
    movd    eax, xmm6
    pop     ebx
    ret
%endmacro

; Instantiate all SATD block sizes for one ISA suffix (%1 = sse2 / ssse3).
%macro SATDS 1
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    mov     eax, [esp+ 8]       ; rewind to top, then step to the
    mov     ecx, [esp+16]       ; right-hand 8x16 half
    add     eax, 8
    add     ecx, 8
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x16_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x8_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    mov     eax, [esp+ 8]       ; right-hand 8x8 half
    mov     ecx, [esp+16]
    add     eax, 8
    add     ecx, 8
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x8_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x4_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_END
%endmacro ; SATDS

%define SUM8x4 SUM8x4_SSE2
SATDS sse2
%ifdef HAVE_SSE3
%define SUM8x4 SUM8x4_SSSE3
SATDS ssse3
%endif

;-----------------------------------------------------------------------------
;  void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                        const uint8_t *pix2, int stride2,
;                                        int sums[2][4] )
;-----------------------------------------------------------------------------
; Computes, for two adjacent 4x4 blocks at once: sum(pix1), sum(pix2),
; sum(pix1^2)+sum(pix2^2), sum(pix1*pix2), stored per-block into sums[2][4].
cglobal x264_pixel_ssim_4x4x2_core_sse2
    push    ebx
    mov     eax, [esp+ 8]       ; pix1
    mov     ebx, [esp+12]       ; stride1
    mov     ecx, [esp+16]       ; pix2
    mov     edx, [esp+20]       ; stride2
    pxor    xmm0, xmm0          ; constant zero (for unpack)
    pxor    xmm1, xmm1          ; sum(pix1), words
    pxor    xmm2, xmm2          ; sum(pix2), words
    pxor    xmm3, xmm3          ; sum(pix1^2)+sum(pix2^2), dwords
    pxor    xmm4, xmm4          ; sum(pix1*pix2), dwords
%rep 4
    movq    xmm5, [eax]
    movq    xmm6, [ecx]
    punpcklbw xmm5, xmm0        ; bytes -> words
    punpcklbw xmm6, xmm0
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    movdqa  xmm7, xmm5
    pmaddwd xmm5, xmm5          ; pix1^2 pairs
    pmaddwd xmm7, xmm6          ; pix1*pix2 pairs
    pmaddwd xmm6, xmm6          ; pix2^2 pairs
    paddd   xmm3, xmm5
    paddd   xmm4, xmm7
    paddd   xmm3, xmm6
    add     eax, ebx
    add     ecx, edx
%endrep
    ; PHADDW xmm1, xmm2
    ; PHADDD xmm3, xmm4
    ; (SSSE3 phadd equivalents of the pmaddwd/pshufd reduction below)
    mov     eax, [esp+24]       ; sums
    picgetgot ebx
    movdqa  xmm7, [pw_1 GOT_ebx]
    pshufd  xmm5, xmm3, 0xB1
    pmaddwd xmm1, xmm7          ; horizontal add of word sums
    pmaddwd xmm2, xmm7
    pshufd  xmm6, xmm4, 0xB1
    packssdw xmm1, xmm2
    paddd   xmm3, xmm5
    pmaddwd xmm1, xmm7
    paddd   xmm4, xmm6
    pshufd  xmm1, xmm1, 0xD8
    movdqa  xmm5, xmm3
    punpckldq xmm3, xmm4
    punpckhdq xmm5, xmm4
    movq    [eax+ 0], xmm1      ; sums[0][0..1] = s1, s2 (block 0)
    movq    [eax+ 8], xmm3      ; sums[0][2..3] = ss, s12
    psrldq  xmm1, 8
    movq    [eax+16], xmm1      ; sums[1][0..1] (block 1)
    movq    [eax+24], xmm5      ; sums[1][2..3]
    pop     ebx
    ret

;-----------------------------------------------------------------------------
;  float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
; Combines overlapping 4x4 sums and evaluates the SSIM formula for up to 4
; block positions in parallel, masking off positions >= width; returns the
; float total on the x87 stack per cdecl.
cglobal x264_pixel_ssim_end4_sse2
    mov     eax, [esp+ 4]       ; sum0
    mov     ecx, [esp+ 8]       ; sum1
    mov     edx, [esp+12]       ; width
    picpush ebx
    picgetgot ebx
    movdqa  xmm0, [eax+ 0]
    movdqa  xmm1, [eax+16]
    movdqa  xmm2, [eax+32]
    movdqa  xmm3, [eax+48]
    movdqa  xmm4, [eax+64]
    paddd   xmm0, [ecx+ 0]      ; sum0[i] + sum1[i]
    paddd   xmm1, [ecx+16]
    paddd   xmm2, [ecx+32]
    paddd   xmm3, [ecx+48]
    paddd   xmm4, [ecx+64]
    paddd   xmm0, xmm1          ; pairwise-combine adjacent positions
    paddd   xmm1, xmm2
    paddd   xmm2, xmm3
    paddd   xmm3, xmm4
    movdqa  xmm5, [ssim_c1 GOT_ebx]
    movdqa  xmm6, [ssim_c2 GOT_ebx]
    TRANSPOSE4x4D  xmm0, xmm1, xmm2, xmm3, xmm4
;   s1=mm0, s2=mm3, ss=mm4, s12=mm2
    movdqa  xmm1, xmm3
    pslld   xmm3, 16
    pmaddwd xmm1, xmm0          ; s1*s2
    por     xmm0, xmm3          ; pack s1/s2 as words for pmaddwd
    pmaddwd xmm0, xmm0          ; s1*s1 + s2*s2
    pslld   xmm1, 1
    pslld   xmm2, 7
    pslld   xmm4, 6
    psubd   xmm2, xmm1          ; covar*2
    psubd   xmm4, xmm0          ; vars
    paddd   xmm0, xmm5
    paddd   xmm1, xmm5
    paddd   xmm2, xmm6
    paddd   xmm4, xmm6
    cvtdq2ps xmm0, xmm0         ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps xmm1, xmm1         ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps xmm2, xmm2         ; (float)(covar*2 + ssim_c2)
    cvtdq2ps xmm4, xmm4         ; (float)(vars + ssim_c2)
    mulps   xmm1, xmm2
    mulps   xmm0, xmm4
    divps   xmm1, xmm0          ; ssim
    neg     edx
    movdqu  xmm3, [mask_ff + edx*4 + 16 GOT_ebx]
    pand    xmm1, xmm3          ; zero lanes beyond 'width'
    movhlps xmm0, xmm1          ; horizontal sum of 4 floats
    addps   xmm0, xmm1
    pshuflw xmm1, xmm0, 0xE
    addss   xmm0, xmm1
    movd    [picesp+4], xmm0
    fld     dword [picesp+4]    ; float return via x87 st0 (cdecl)
    picpop  ebx
    ret
; (removed web-viewer UI chrome that had been scraped into this file:
;  copy/search/fullscreen/font-size keyboard-shortcut help — not source code)