⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pixel-sse2.asm

📁 DM642 H.264 codec DM642 H.264 codec DM642 H.264 codec DM642 H.264 codec
💻 ASM
📖 第 1 页 / 共 2 页
字号:
    ; NOTE(review): the instructions down to the first %endmacro are the tail of
    ; a macro whose %macro line lies above this excerpt (presumably
    ; SSD_INC_2x16P_SSE2, which is what the SSD functions below invoke).
    ; They are reproduced unchanged.
    movdqa  xmm4,   xmm3

    punpcklbw xmm1, xmm7                ; widen bytes to words against xmm7
    punpckhbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    punpckhbw xmm4, xmm7

    pmaddwd xmm1,   xmm1                ; square words, add adjacent pairs -> dwords
    pmaddwd xmm2,   xmm2
    pmaddwd xmm3,   xmm3
    pmaddwd xmm4,   xmm4

    lea     eax,    [eax+2*ebx]         ; pix1 += 2*stride1
    lea     ecx,    [ecx+2*edx]         ; pix2 += 2*stride2

    paddd   xmm1,   xmm2
    paddd   xmm3,   xmm4
    paddd   xmm0,   xmm1                ; fold into the running sum in xmm0
    paddd   xmm0,   xmm3
%endmacro

;-----------------------------------------------------------------------------
; SSD_START_SSE2: common prologue of the SSD functions (32-bit cdecl).
; Saves ebx, then loads the four stack arguments; the offsets already account
; for the pushed ebx.
; Out:  eax = pix1, ebx = stride1, ecx = pix2, edx = stride2,
;       xmm7 = 0, xmm0 = 0 (sum accumulator)
;-----------------------------------------------------------------------------
%macro SSD_START_SSE2 0
    push    ebx

    mov     eax,    [esp+ 8]    ; pix1
    mov     ebx,    [esp+12]    ; stride1
    mov     ecx,    [esp+16]    ; pix2
    mov     edx,    [esp+20]    ; stride2

    pxor    xmm7,   xmm7         ; zero
    pxor    xmm0,   xmm0         ; xmm0 holds the sum
%endmacro

;-----------------------------------------------------------------------------
; SSD_END_SSE2: common epilogue of the SSD functions.
; Adds the four dwords of xmm0 together, returns the total in eax,
; restores ebx and returns to the caller.
; Clobbers: xmm1
;-----------------------------------------------------------------------------
%macro SSD_END_SSE2 0
    movdqa  xmm1,   xmm0
    psrldq  xmm1,    8                  ; high qword -> low qword
    paddd   xmm0,   xmm1
    movdqa  xmm1,   xmm0
    psrldq  xmm1,    4                  ; dword 1 -> dword 0
    paddd   xmm0,   xmm1
    movd    eax,    xmm0                ; return value

    pop     ebx
    ret
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x16_sse2:
    SSD_START_SSE2
%rep 8                                  ; each step advances both pointers by 2 rows
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x8_sse2:
    SSD_START_SSE2
%rep 4                                  ; each step advances both pointers by 2 rows
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

; %1=(row2, row0) %2=(row3, row1) %3=junk
; output in %1=(row3, row0) and %3=(row2, row1)
%macro HADAMARD4x4_SSE2 3
    movdqa     %3, %1
    paddw      %1, %2
    psubw      %3, %2
    movdqa     %2, %1
    punpcklqdq %1, %3
    punpckhqdq %2, %3
    movdqa     %3, %1
    paddw      %1, %2
    psubw      %3, %2
%endmacro

;;; two HADAMARD4x4_SSE2 running side-by-side
;;; (same instruction sequence as HADAMARD4x4_SSE2, interleaved for two
;;; independent register triples: %1..%3 and %4..%6)
%macro HADAMARD4x4_TWO_SSE2 6    ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
    movdqa     %3, %1
    movdqa     %6, %4
    paddw      %1, %2
    paddw      %4, %5
    psubw      %3, %2
    psubw      %6, %5
    movdqa     %2, %1
    movdqa     %5, %4
    punpcklqdq %1, %3
    punpcklqdq %4, %6
    punpckhqdq %2, %3
    punpckhqdq %5, %6
    movdqa     %3, %1
    movdqa     %6, %4
    paddw      %1, %2
    paddw      %4, %5
    psubw      %3, %2
    psubw      %6, %5
%endmacro

%macro TRANSPOSE4x4_TWIST_SSE2 3    ; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
    movdqa     %3, %1
    punpcklwd  %1, %2
    punpckhwd  %2, %3             ; backwards because the high quadwords are already swapped

    movdqa     %3, %1
    punpckldq  %1, %2
    punpckhdq  %3, %2

    movdqa     %2, %1
    punpcklqdq %1, %3
    punpckhqdq %2, %3
%endmacro

;;; two TRANSPOSE4x4_TWIST_SSE2 running side-by-side
;;; (same instruction sequence as TRANSPOSE4x4_TWIST_SSE2, interleaved for
;;; two independent register triples: %1..%3 and %4..%6)
%macro TRANSPOSE4x4_TWIST_TWO_SSE2 6    ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
    movdqa     %3, %1
    movdqa     %6, %4
    punpcklwd  %1, %2
    punpcklwd  %4, %5
    punpckhwd  %2, %3
    punpckhwd  %5, %6
    movdqa     %3, %1
    movdqa     %6, %4
    punpckldq  %1, %2
    punpckldq  %4, %5
    punpckhdq  %3, %2
    punpckhdq  %6, %5
    movdqa     %2, %1
    movdqa     %5, %4
    punpcklqdq %1, %3
    punpcklqdq %4, %6
    punpckhqdq %2, %3
    punpckhqdq %5, %6
%endmacro

;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
;;; destroys xmm2, 3
;;; the value in xmm7 doesn't matter: it's only subtracted from itself
;;; In:  eax = pix1, ebx = stride1, ecx = pix2, edx = stride2
;;; Out: eax and ecx advanced by 4 rows (4*stride each)
%macro LOAD4x8_DIFF_SSE2 0
    ; row 0
    movq      xmm0, [eax]
    movq      xmm4, [ecx]
    punpcklbw xmm0, xmm7
    punpcklbw xmm4, xmm7
    psubw     xmm0, xmm4

    ; row 1
    movq      xmm1, [eax+ebx]
    movq      xmm5, [ecx+edx]
    lea       eax,  [eax+2*ebx]
    lea       ecx,  [ecx+2*edx]
    punpcklbw xmm1, xmm7
    punpcklbw xmm5, xmm7
    psubw     xmm1, xmm5

    ; row 2
    movq       xmm2, [eax]
    movq       xmm4, [ecx]
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm4, xmm7
    psubw      xmm2, xmm4
    movdqa     xmm4, xmm0
    punpcklqdq xmm0, xmm2        ; rows 0 and 2
    punpckhqdq xmm4, xmm2        ; next 4x4 rows 0 and 2

    ; row 3
    movq       xmm3, [eax+ebx]
    movq       xmm5, [ecx+edx]
    lea        eax,  [eax+2*ebx]
    lea        ecx,  [ecx+2*edx]
    punpcklbw  xmm3, xmm7
    punpcklbw  xmm5, xmm7
    psubw      xmm3, xmm5
    movdqa     xmm5, xmm1
    punpcklqdq xmm1, xmm3        ; rows 1 and 3
    punpckhqdq xmm5, xmm3        ; next 4x4 rows 1 and 3
%endmacro

;;; replaces %1 and %2 by their per-word absolute values (max(x, 0-x)) and
;;; adds both, with unsigned saturation, into the accumulator %4
%macro SUM4x4_SSE2 4    ; 02 13 junk sum
    pxor    %3, %3
    psubw   %3, %1              ; %3 = -%1
    pmaxsw  %1, %3              ; %1 = |%1|

    pxor    %3, %3
    psubw   %3, %2              ; %3 = -%2
    pmaxsw  %2, %3              ; %2 = |%2|

    paddusw %4, %1
    paddusw %4, %2
%endmacro

;;; two SUM4x4_SSE2 running side-by-side
;;; (here each pair is added together first, then added into %7)
%macro SUM4x4_TWO_SSE2 7    ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
    pxor    %3, %3
    pxor    %6, %6
    psubw   %3, %1
    psubw   %6, %4
    pmaxsw  %1, %3
    pmaxsw  %4, %6

    pxor    %3, %3
    pxor    %6, %6
    psubw   %3, %2
    psubw   %6, %5
    pmaxsw  %2, %3
    pmaxsw  %5, %6

    paddusw %1, %2
    paddusw %4, %5
    paddusw %7, %1
    paddusw %7, %4
%endmacro

;;; halves each word of %1, sums the eight words and returns the total in eax
;;; clobbers ebx (via picgetgot) and %2
;;; picgetgot, pd_0000ffff and GOT_ebx are defined outside this excerpt
%macro SUM_MM_SSE2 2    ; sum junk
    ; ebx is no longer used at this point, so no push needed
    picgetgot ebx
    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    psrlw   %1, 1
    movdqa  %2, %1
    psrldq  %1, 2
    paddusw %1, %2              ; add adjacent word pairs
    pand    %1, [pd_0000ffff GOT_ebx]
    movdqa  %2, %1
    psrldq  %1, 4
    paddd   %1, %2
    movdqa  %2, %1
    psrldq  %1, 8
    paddd   %1, %2
    movd    eax,%1
%endmacro

;;; one 8-wide, 4-row step: load differences, Hadamard, transpose, Hadamard,
;;; then add the absolute values into xmm6
;;; the register order changes between steps because each macro leaves its
;;; results in different operands (see the comments on each macro)
%macro SATD_TWO_SSE2 0
    LOAD4x8_DIFF_SSE2
    HADAMARD4x4_TWO_SSE2        xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
    TRANSPOSE4x4_TWIST_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    HADAMARD4x4_TWO_SSE2        xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    SUM4x4_TWO_SSE2             xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
%endmacro

;-----------------------------------------------------------------------------
; SATD_START: common prologue of the SATD functions (32-bit cdecl).
; Out:  eax = pix1, ebx = stride1, ecx = pix2, edx = stride2,
;       xmm6 = 0 (sum accumulator)
;-----------------------------------------------------------------------------
%macro SATD_START 0
    push    ebx

    mov     eax,    [esp+ 8]    ; pix1
    mov     ebx,    [esp+12]    ; stride1
    mov     ecx,    [esp+16]    ; pix2
    mov     edx,    [esp+20]    ; stride2

    pxor    xmm6,    xmm6
%endmacro

;-----------------------------------------------------------------------------
; SATD_END: common epilogue of the SATD functions.
; Reduces xmm6 to the return value in eax (xmm7 is scratch), restores ebx
; and returns to the caller.
;-----------------------------------------------------------------------------
%macro SATD_END 0
    SUM_MM_SSE2  xmm6, xmm7

    pop     ebx
    ret
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x16_sse2:
    SATD_START

    ; left 8 columns, 16 rows
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    ; reload the original pointers and step 8 bytes right
    mov     eax,    [esp+ 8]
    mov     ecx,    [esp+16]
    lea     eax,    [eax+8]
    lea     ecx,    [ecx+8]

    ; right 8 columns, 16 rows
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x16_sse2:
    SATD_START

    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x8_sse2:
    SATD_START

    ; left 8 columns, 8 rows
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    ; reload the original pointers and step 8 bytes right
    mov     eax,    [esp+ 8]
    mov     ecx,    [esp+16]
    lea     eax,    [eax+8]
    lea     ecx,    [ecx+8]

    ; right 8 columns, 8 rows
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x8_sse2:
    SATD_START

    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x4_sse2:
    SATD_START

    SATD_TWO_SSE2

    SATD_END

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -