⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pixel-sse2.asm

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 ASM
📖 第 1 页 / 共 2 页
字号:
%endmacro  ; NOTE(review): closes a macro opened before this chunk — presumably
           ; SSD_INC_2x16P_SSE2, since it is invoked below; confirm against full file.

;-----------------------------------------------------------------------------
; Sum the four dword partial SSDs held in xmm0 into eax and return.
; Pairs with SSD_START_SSE2 (defined earlier in the file), which is assumed
; to have pushed ebx — hence the matching `pop ebx` here.
;-----------------------------------------------------------------------------
%macro SSD_END_SSE2 0
    movdqa  xmm1,   xmm0
    psrldq  xmm1,    8          ; bring high qword down
    paddd   xmm0,   xmm1
    movdqa  xmm1,   xmm0
    psrldq  xmm1,    4          ; bring second dword down
    paddd   xmm0,   xmm1
    movd    eax,    xmm0        ; eax = total SSD (return value)
    pop ebx
    ret
%endmacro

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x16_sse2
    SSD_START_SSE2
%rep 8                          ; 8 iterations x 2 rows = 16 rows
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x8_sse2
    SSD_START_SSE2
%rep 4                          ; 4 iterations x 2 rows = 8 rows
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

;-----------------------------------------------------------------------------
; Two packed-word add/sub butterflies without a temp register:
;   %1 = %1+%2, %2 = %2-%1(old), %3 = %3+%4, %4 = %4-%3(old)
; (the doubling then subtract of the new sum yields the difference)
;-----------------------------------------------------------------------------
%macro SUMSUB_BADC 4
    paddw   %1, %2
    paddw   %3, %4
    paddw   %2, %2
    paddw   %4, %4
    psubw   %2, %1
    psubw   %4, %3
%endmacro

; 4-point Hadamard transform = two butterfly stages.
%macro HADAMARD1x4 4
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4
%endmacro

; Interleave low/high halves of %3 with %4 into %3/%5 (transpose step).
; %1 = mov suffix (e.g. dqa), %2 = punpck size suffix (wd/dq/qdq).
%macro SBUTTERFLY 5
    mov%1       %5, %3
    punpckl%2   %3, %4
    punpckh%2   %5, %4
%endmacro

%macro SBUTTERFLY2 5  ; not really needed, but allows transpose4x4 to not shuffle registers
    mov%1       %5, %3
    punpckh%2   %3, %4
    punpckl%2   %5, %4
%endmacro

%macro TRANSPOSE4x4D 5   ; ABCD-T -> ADTC
    SBUTTERFLY dqa, dq,  %1, %2, %5
    SBUTTERFLY dqa, dq,  %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3
%endmacro

%macro TRANSPOSE2x4x4W 5   ; ABCD-T -> ABCD
    SBUTTERFLY  dqa, wd,  %1, %2, %5
    SBUTTERFLY  dqa, wd,  %3, %4, %2
    SBUTTERFLY  dqa, dq,  %1, %3, %4
    SBUTTERFLY2 dqa, dq,  %5, %2, %3
    SBUTTERFLY  dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3
%endmacro

;-----------------------------------------------------------------------------
; Load 8 bytes from each of [pix1]/[pix2] and produce 8 signed word
; differences pix1[i]-pix2[i] in %1, without needing a zero register:
;   %1 = interleave(p1,p2) -> word = p1[i] + (p2[i]<<8)
;   %2 = interleave(p2,p2) -> word = p2[i] + (p2[i]<<8)
;   %1 - %2               -> word = p1[i] - p2[i]
; %2 is clobbered (used as scratch).
;-----------------------------------------------------------------------------
%macro LOAD_DIFF_8P 4  ; MMP, MMT, [pix1], [pix2]
    movq        %1, %3
    movq        %2, %4
    punpcklbw   %1, %2
    punpcklbw   %2, %2
    psubw       %1, %2
%endmacro

;-----------------------------------------------------------------------------
; abs() each word of %1/%2 via max(x,-x) (SSE2 has no pabsw), then
; accumulate into %4 with saturating adds.
;-----------------------------------------------------------------------------
%macro SUM4x4_SSE2 4    ; 02 13 junk sum
    pxor    %3, %3
    psubw   %3, %1      ; %3 = -%1
    pmaxsw  %1, %3      ; %1 = |%1|
    pxor    %3, %3
    psubw   %3, %2
    pmaxsw  %2, %3      ; %2 = |%2|
    paddusw %4, %1
    paddusw %4, %2
%endmacro

; Same abs-and-accumulate, but for two register pairs at once,
; interleaved to expose instruction-level parallelism.
%macro SUM8x4_SSE2 7    ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
    pxor    %3, %3
    pxor    %6, %6
    psubw   %3, %1
    psubw   %6, %4
    pmaxsw  %1, %3
    pmaxsw  %4, %6
    pxor    %3, %3
    pxor    %6, %6
    psubw   %3, %2
    psubw   %6, %5
    pmaxsw  %2, %3
    pmaxsw  %5, %6
    paddusw %1, %2
    paddusw %4, %5
    paddusw %7, %1
    paddusw %7, %4
%endmacro

; SSSE3 variant: pabsw replaces the pxor/psubw/pmaxsw sequence.
%macro SUM8x4_SSSE3 7    ; a02 a13 . b02 b13 . sum
    pabsw   %1, %1
    pabsw   %2, %2
    pabsw   %4, %4
    pabsw   %5, %5
    paddusw %1, %2
    paddusw %4, %5
    paddusw %7, %1
    paddusw %7, %4
%endmacro

;-----------------------------------------------------------------------------
; SATD of an 8-wide x 4-row tile (two 4x4 blocks side by side), accumulated
; into xmm6. Advances eax/ecx by 4 rows. Expects registers as set up by
; SATD_START. SUM8x4 is %defined below to the SSE2 or SSSE3 variant.
;-----------------------------------------------------------------------------
%macro SATD_TWO_SSE2 0
    LOAD_DIFF_8P xmm0, xmm4, [eax], [ecx]
    LOAD_DIFF_8P xmm1, xmm5, [eax+ebx], [ecx+edx]
    lea          eax,  [eax+2*ebx]
    lea          ecx,  [ecx+2*edx]
    LOAD_DIFF_8P xmm2, xmm4, [eax], [ecx]
    LOAD_DIFF_8P xmm3, xmm5, [eax+ebx], [ecx+edx]
    lea          eax,  [eax+2*ebx]
    lea          ecx,  [ecx+2*edx]
    HADAMARD1x4       xmm0, xmm1, xmm2, xmm3   ; vertical transform
    TRANSPOSE2x4x4W   xmm0, xmm1, xmm2, xmm3, xmm4
    HADAMARD1x4       xmm0, xmm1, xmm2, xmm3   ; horizontal transform
    SUM8x4            xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
%endmacro

; Load cdecl args (offsets are +8.. because ebx was just pushed) and
; zero the xmm6 accumulator.
%macro SATD_START 0
    push    ebx
    mov     eax,    [esp+ 8]    ; pix1
    mov     ebx,    [esp+12]    ; stride1
    mov     ecx,    [esp+16]    ; pix2
    mov     edx,    [esp+20]    ; stride2
    pxor    xmm6,    xmm6
%endmacro

; Reduce the word sums in xmm6 to eax and return.
; HADDW is presumably a horizontal-add macro defined earlier in the file.
%macro SATD_END 0
    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    psrlw   xmm6, 1
    HADDW   xmm6, xmm7
    movd    eax,  xmm6
    pop     ebx
    ret
%endmacro

;-----------------------------------------------------------------------------
; Instantiate the full family of SATD functions; %1 is the ISA suffix
; (sse2 or ssse3). Each SATD_TWO_SSE2 covers 8x4 pixels; 16-wide sizes
; do the left 8 columns, then reload the pointers +8 for the right half.
;-----------------------------------------------------------------------------
%macro SATDS 1
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    mov     eax,    [esp+ 8]    ; rewind to pix1 ...
    mov     ecx,    [esp+16]    ; ... and pix2
    add     eax,    8           ; right 8 columns
    add     ecx,    8
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x16_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x8_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    mov     eax,    [esp+ 8]
    mov     ecx,    [esp+16]
    add     eax,    8
    add     ecx,    8
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x8_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x4_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_END
%endmacro ; SATDS

; Instantiate with the plain-SSE2 abs, and (if building with SSE3 support)
; again with the pabsw-based SSSE3 abs.
%define SUM8x4 SUM8x4_SSE2
SATDS sse2
%ifdef HAVE_SSE3
%define SUM8x4 SUM8x4_SSSE3
SATDS ssse3
%endif

;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
; Over 4 rows of 8 pixels, accumulates per-column-pair:
;   xmm1 = sum(pix1), xmm2 = sum(pix2),
;   xmm3 = sum(pix1^2) + sum(pix2^2), xmm4 = sum(pix1*pix2),
; then horizontally reduces each 8-wide vector to two 4-pixel-block sums
; and stores them into sums[2][4].
; picgetgot / GOT_ebx / pw_1 are PIC and constant helpers presumably
; defined elsewhere in the project — confirm against the full source.
cglobal x264_pixel_ssim_4x4x2_core_sse2
    push      ebx
    mov       eax,  [esp+ 8]    ; pix1
    mov       ebx,  [esp+12]    ; stride1
    mov       ecx,  [esp+16]    ; pix2
    mov       edx,  [esp+20]    ; stride2
    pxor      xmm0, xmm0        ; zero, for byte->word unpack
    pxor      xmm1, xmm1        ; s1 accumulator
    pxor      xmm2, xmm2        ; s2 accumulator
    pxor      xmm3, xmm3        ; ss accumulator
    pxor      xmm4, xmm4        ; s12 accumulator
%rep 4
    movq      xmm5, [eax]
    movq      xmm6, [ecx]
    punpcklbw xmm5, xmm0        ; zero-extend pix1 bytes to words
    punpcklbw xmm6, xmm0        ; zero-extend pix2 bytes to words
    paddw     xmm1, xmm5
    paddw     xmm2, xmm6
    movdqa    xmm7, xmm5
    pmaddwd   xmm5, xmm5        ; pix1^2 (pairwise-summed dwords)
    pmaddwd   xmm7, xmm6        ; pix1*pix2
    pmaddwd   xmm6, xmm6        ; pix2^2
    paddd     xmm3, xmm5
    paddd     xmm4, xmm7
    paddd     xmm3, xmm6
    add       eax,  ebx         ; next row
    add       ecx,  edx
%endrep
    ; PHADDW xmm1, xmm2
    ; PHADDD xmm3, xmm4
    mov       eax,  [esp+24]    ; sums
    picgetgot ebx
    movdqa    xmm7, [pw_1 GOT_ebx]
    pshufd    xmm5, xmm3, 0xB1  ; swap adjacent dword pairs
    pmaddwd   xmm1, xmm7        ; horizontal word add via *1
    pmaddwd   xmm2, xmm7
    pshufd    xmm6, xmm4, 0xB1
    packssdw  xmm1, xmm2
    paddd     xmm3, xmm5
    pmaddwd   xmm1, xmm7
    paddd     xmm4, xmm6
    pshufd    xmm1, xmm1, 0xD8
    movdqa    xmm5, xmm3
    punpckldq xmm3, xmm4
    punpckhdq xmm5, xmm4
    movq      [eax+ 0], xmm1    ; sums[0][0..1] = s1, s2 (block 0)
    movq      [eax+ 8], xmm3    ; sums[0][2..3] = ss, s12
    psrldq    xmm1, 8
    movq      [eax+16], xmm1    ; sums[1][0..1]
    movq      [eax+24], xmm5    ; sums[1][2..3]
    pop       ebx
    ret

;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
; Combines neighboring column sums, computes 4 SSIM values in parallel,
; masks off lanes beyond `width`, and returns the (float) sum on the x87
; stack (fld), per the 32-bit cdecl float-return convention.
; picpush / picesp / ssim_c1 / ssim_c2 / mask_ff are presumably defined
; elsewhere in the project — confirm against the full source.
cglobal x264_pixel_ssim_end4_sse2
    mov      eax,  [esp+ 4]     ; sum0
    mov      ecx,  [esp+ 8]     ; sum1
    mov      edx,  [esp+12]     ; width
    picpush  ebx
    picgetgot ebx
    movdqa   xmm0, [eax+ 0]
    movdqa   xmm1, [eax+16]
    movdqa   xmm2, [eax+32]
    movdqa   xmm3, [eax+48]
    movdqa   xmm4, [eax+64]
    paddd    xmm0, [ecx+ 0]     ; add the two rows of sums
    paddd    xmm1, [ecx+16]
    paddd    xmm2, [ecx+32]
    paddd    xmm3, [ecx+48]
    paddd    xmm4, [ecx+64]
    paddd    xmm0, xmm1         ; combine adjacent column groups
    paddd    xmm1, xmm2
    paddd    xmm2, xmm3
    paddd    xmm3, xmm4
    movdqa   xmm5, [ssim_c1 GOT_ebx]
    movdqa   xmm6, [ssim_c2 GOT_ebx]
    TRANSPOSE4x4D  xmm0, xmm1, xmm2, xmm3, xmm4
;   s1=mm0, s2=mm3, ss=mm4, s12=mm2
    movdqa   xmm1, xmm3
    pslld    xmm3, 16
    pmaddwd  xmm1, xmm0  ; s1*s2
    por      xmm0, xmm3
    pmaddwd  xmm0, xmm0  ; s1*s1 + s2*s2
    pslld    xmm1, 1
    pslld    xmm2, 7
    pslld    xmm4, 6
    psubd    xmm2, xmm1  ; covar*2
    psubd    xmm4, xmm0  ; vars
    paddd    xmm0, xmm5
    paddd    xmm1, xmm5
    paddd    xmm2, xmm6
    paddd    xmm4, xmm6
    cvtdq2ps xmm0, xmm0  ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps xmm1, xmm1  ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps xmm2, xmm2  ; (float)(covar*2 + ssim_c2)
    cvtdq2ps xmm4, xmm4  ; (float)(vars + ssim_c2)
    mulps    xmm1, xmm2
    mulps    xmm0, xmm4
    divps    xmm1, xmm0  ; ssim
    neg      edx
    movdqu   xmm3, [mask_ff + edx*4 + 16 GOT_ebx]  ; keep only `width` lanes
    pand     xmm1, xmm3
    movhlps  xmm0, xmm1  ; horizontal sum of the 4 floats
    addps    xmm0, xmm1
    pshuflw  xmm1, xmm0, 0xE
    addss    xmm0, xmm1
    movd     [picesp+4], xmm0
    fld      dword [picesp+4]   ; return float in st0
    picpop   ebx
    ret

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -