
pixel-sse2.asm

Compiles successfully under Linux.
Language: ASM
Page 1 of 2
;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*          Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

SECTION .rodata align=16
pw_1:    times 8 dw 1
ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
         times 16 db 0

SECTION .text

%macro HADDD 2 ; sum junk
    movhlps %2, %1
    paddd   %1, %2
    pshuflw %2, %1, 0xE
    paddd   %1, %2
%endmacro

%macro HADDW 2
    pmaddwd %1, [pw_1 GLOBAL]
    HADDD   %1, %2
%endmacro

%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm1,   [rdx]
    movdqu  xmm2,   [rdx+rcx]
    lea     rdx,    [rdx+2*rcx]
    movdqu  xmm3,   [rdx]
    movdqu  xmm4,   [rdx+rcx]
    psadbw  xmm1,   [rdi]
    psadbw  xmm2,   [rdi+rsi]
    lea     rdi,    [rdi+2*rsi]
    psadbw  xmm3,   [rdi]
    psadbw  xmm4,   [rdi+rsi]
    lea     rdi,    [rdi+2*rsi]
    lea     rdx,    [rdx+2*rcx]
    paddw   xmm1,   xmm2
    paddw   xmm3,   xmm4
    paddw   xmm0,   xmm1
    paddw   xmm0,   xmm3
%endmacro

%macro SAD_END_SSE2 0
    movhlps xmm1, xmm0
    paddw   xmm0, xmm1
    movd    eax,  xmm0
    ret
%endmacro
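; The SAD functions below follow the prototypes documented in their headers:
; source pixels in rdi with stride rsi, reference pixels in rdx with stride
; rcx, with psadbw folding each 16-byte row of absolute byte differences
; into partial sums. As an informal sketch of what they compute (this C
; reference is illustrative and not part of the original file; the name
; pixel_sad_16x16_c is made up here):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   int pixel_sad_16x16_c( uint8_t *pix1, int i_stride_pix1,
;                          uint8_t *pix2, int i_stride_pix2 )
;   {
;       int i_sum = 0;
;       for( int y = 0; y < 16; y++ )
;       {
;           for( int x = 0; x < 16; x++ )
;               i_sum += abs( pix1[x] - pix2[x] );
;           pix1 += i_stride_pix1;  /* advance both planes one row */
;           pix2 += i_stride_pix2;
;       }
;       return i_sum;
;   }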
;-----------------------------------------------------------------------------
;   int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x16_sse2
    movdqu xmm0, [rdx]
    movdqu xmm1, [rdx+rcx]
    lea    rdx,  [rdx+2*rcx]
    movdqu xmm2, [rdx]
    movdqu xmm3, [rdx+rcx]
    lea    rdx,  [rdx+2*rcx]
    psadbw xmm0, [rdi]
    psadbw xmm1, [rdi+rsi]
    lea    rdi,  [rdi+2*rsi]
    movdqu xmm4, [rdx]
    paddw  xmm0, xmm1
    psadbw xmm2, [rdi]
    psadbw xmm3, [rdi+rsi]
    lea    rdi,  [rdi+2*rsi]
    movdqu xmm5, [rdx+rcx]
    lea    rdx,  [rdx+2*rcx]
    paddw  xmm2, xmm3
    movdqu xmm6, [rdx]
    movdqu xmm7, [rdx+rcx]
    lea    rdx,  [rdx+2*rcx]
    paddw  xmm0, xmm2
    psadbw xmm4, [rdi]
    psadbw xmm5, [rdi+rsi]
    lea    rdi,  [rdi+2*rsi]
    movdqu xmm1, [rdx]
    paddw  xmm4, xmm5
    psadbw xmm6, [rdi]
    psadbw xmm7, [rdi+rsi]
    lea    rdi,  [rdi+2*rsi]
    movdqu xmm2, [rdx+rcx]
    lea    rdx,  [rdx+2*rcx]
    paddw  xmm6, xmm7
    movdqu xmm3, [rdx]
    paddw  xmm0, xmm4
    movdqu xmm4, [rdx+rcx]
    lea    rdx,  [rdx+2*rcx]
    paddw  xmm0, xmm6
    psadbw xmm1, [rdi]
    psadbw xmm2, [rdi+rsi]
    lea    rdi,  [rdi+2*rsi]
    movdqu xmm5, [rdx]
    paddw  xmm1, xmm2
    psadbw xmm3, [rdi]
    psadbw xmm4, [rdi+rsi]
    lea    rdi,  [rdi+2*rsi]
    movdqu xmm6, [rdx+rcx]
    lea    rdx,  [rdx+2*rcx]
    paddw  xmm3, xmm4
    movdqu xmm7, [rdx]
    paddw  xmm0, xmm1
    movdqu xmm1, [rdx+rcx]
    paddw  xmm0, xmm3
    psadbw xmm5, [rdi]
    psadbw xmm6, [rdi+rsi]
    lea    rdi,  [rdi+2*rsi]
    paddw  xmm5, xmm6
    psadbw xmm7, [rdi]
    psadbw xmm1, [rdi+rsi]
    paddw  xmm7, xmm1
    paddw  xmm0, xmm5
    paddw  xmm0, xmm7
    SAD_END_SSE2

;-----------------------------------------------------------------------------
;   int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x8_sse2
    pxor    xmm0,   xmm0
    SAD_INC_4x16P_SSE2
    SAD_INC_4x16P_SSE2
    SAD_END_SSE2

; sad x3 / x4

%macro SAD_X3_START_1x16P 0
    movdqa xmm3, [parm1q]
    movdqu xmm0, [parm2q]
    movdqu xmm1, [parm3q]
    movdqu xmm2, [parm4q]
    psadbw xmm0, xmm3
    psadbw xmm1, xmm3
    psadbw xmm2, xmm3
%endmacro

%macro SAD_X3_1x16P 2
    movdqa xmm3, [parm1q+%1]
    movdqu xmm4, [parm2q+%2]
    movdqu xmm5, [parm3q+%2]
    movdqu xmm6, [parm4q+%2]
    psadbw xmm4, xmm3
    psadbw xmm5, xmm3
    psadbw xmm6, xmm3
    paddw  xmm0, xmm4
    paddw  xmm1, xmm5
    paddw  xmm2, xmm6
%endmacro

%macro SAD_X3_2x16P 1
%if %1
    SAD_X3_START_1x16P
%else
    SAD_X3_1x16P 0, 0
%endif
    SAD_X3_1x16P FENC_STRIDE, parm5q
    add  parm1q, 2*FENC_STRIDE
    lea  parm2q, [parm2q+2*parm5q]
    lea  parm3q, [parm3q+2*parm5q]
    lea  parm4q, [parm4q+2*parm5q]
%endmacro

%macro SAD_X4_START_1x16P 0
    movdqa xmm7, [parm1q]
    movdqu xmm0, [parm2q]
    movdqu xmm1, [parm3q]
    movdqu xmm2, [parm4q]
    movdqu xmm3, [parm5q]
    psadbw xmm0, xmm7
    psadbw xmm1, xmm7
    psadbw xmm2, xmm7
    psadbw xmm3, xmm7
%endmacro

%macro SAD_X4_1x16P 2
    movdqa xmm7, [parm1q+%1]
    movdqu xmm4, [parm2q+%2]
    movdqu xmm5, [parm3q+%2]
    movdqu xmm6, [parm4q+%2]
    movdqu xmm8, [parm5q+%2]
    psadbw xmm4, xmm7
    psadbw xmm5, xmm7
    psadbw xmm6, xmm7
    psadbw xmm8, xmm7
    paddw  xmm0, xmm4
    paddw  xmm1, xmm5
    paddw  xmm2, xmm6
    paddw  xmm3, xmm8
%endmacro

%macro SAD_X4_2x16P 1
%if %1
    SAD_X4_START_1x16P
%else
    SAD_X4_1x16P 0, 0
%endif
    SAD_X4_1x16P FENC_STRIDE, parm6q
    add  parm1q, 2*FENC_STRIDE
    lea  parm2q, [parm2q+2*parm6q]
    lea  parm3q, [parm3q+2*parm6q]
    lea  parm4q, [parm4q+2*parm6q]
    lea  parm5q, [parm5q+2*parm6q]
%endmacro

%macro SAD_X3_END 0
    movhlps xmm4, xmm0
    movhlps xmm5, xmm1
    movhlps xmm6, xmm2
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    movd [parm6q+0], xmm0
    movd [parm6q+4], xmm1
    movd [parm6q+8], xmm2
    ret
%endmacro

%macro SAD_X4_END 0
    mov      rax, parm7q
    movhlps xmm4, xmm0
    movhlps xmm5, xmm1
    movhlps xmm6, xmm2
    movhlps xmm7, xmm3
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    paddw   xmm3, xmm7
    movd [rax+0], xmm0
    movd [rax+4], xmm1
    movd [rax+8], xmm2
    movd [rax+12], xmm3
    ret
%endmacro

;-----------------------------------------------------------------------------
;  void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                     uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal x264_pixel_sad_x%1_%2x%3_sse2
    SAD_X%1_2x%2P 1
%rep %3/2-1
    SAD_X%1_2x%2P 0
%endrep
    SAD_X%1_END
%endmacro

SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 4, 16, 16
SAD_X 4, 16,  8

; ssd
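; The SSD routines compute a sum of squared differences. In the asm, |a-b|
; for unsigned bytes is built from two saturating subtractions OR'd
; together, the bytes are widened to words against a zero register, and
; pmaddwd squares and pairwise-accumulates them. An equivalent C reference
; (illustrative only; the name pixel_ssd_16x16_c is not from this file):
;
;   #include <stdint.h>
;
;   int pixel_ssd_16x16_c( uint8_t *pix1, int i_stride_pix1,
;                          uint8_t *pix2, int i_stride_pix2 )
;   {
;       int i_sum = 0;
;       for( int y = 0; y < 16; y++ )
;       {
;           for( int x = 0; x < 16; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               i_sum += d * d;      /* square and accumulate */
;           }
;           pix1 += i_stride_pix1;
;           pix2 += i_stride_pix2;
;       }
;       return i_sum;
;   }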
%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm1,   [rdi]
    movdqu  xmm2,   [rdx]
    movdqu  xmm3,   [rdi+rsi]
    movdqu  xmm4,   [rdx+rcx]
    movdqa  xmm5,   xmm1
    movdqa  xmm6,   xmm3
    psubusb xmm1,   xmm2
    psubusb xmm3,   xmm4
    psubusb xmm2,   xmm5
    psubusb xmm4,   xmm6
    por     xmm1,   xmm2
    por     xmm3,   xmm4
    movdqa  xmm2,   xmm1
    movdqa  xmm4,   xmm3
    punpcklbw xmm1, xmm7
    punpckhbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    punpckhbw xmm4, xmm7
    pmaddwd xmm1,   xmm1
    pmaddwd xmm2,   xmm2
    pmaddwd xmm3,   xmm3
    pmaddwd xmm4,   xmm4
    lea     rdi,    [rdi+2*rsi]
    lea     rdx,    [rdx+2*rcx]
    paddd   xmm1,   xmm2
    paddd   xmm3,   xmm4
    paddd   xmm0,   xmm1
    paddd   xmm0,   xmm3
%endmacro

%macro SSD_START_SSE2 0
    pxor    xmm7,   xmm7        ; zero
    pxor    xmm0,   xmm0        ; mm0 holds the sum
%endmacro

%macro SSD_END_SSE2 0
    HADDD   xmm0, xmm1
    movd    eax,  xmm0
    ret
%endmacro

;-----------------------------------------------------------------------------
;   int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x16_sse2
    SSD_START_SSE2
%rep 8
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

;-----------------------------------------------------------------------------
;   int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x8_sse2
    SSD_START_SSE2
%rep 4
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

%macro SUMSUB_BADC 4
    paddw   %1, %2
    paddw   %3, %4
    paddw   %2, %2
    paddw   %4, %4
    psubw   %2, %1
    psubw   %4, %3
%endmacro

%macro HADAMARD1x4 4
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4
%endmacro

%macro HADAMARD1x8 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%endmacro

;;; row transform not used, because phaddw is much slower than paddw on a Conroe
;%macro PHSUMSUB 3
;    movdqa  %3, %1
;    phaddw  %1, %2
;    phsubw  %3, %2
;%endmacro

;%macro HADAMARD4x1_SSSE3 5  ; ABCD-T -> ADTC
;    PHSUMSUB    %1, %2, %5
;    PHSUMSUB    %3, %4, %2
;    PHSUMSUB    %1, %3, %4
;    PHSUMSUB    %5, %2, %3
;%endmacro

%macro SBUTTERFLY 5
    mov%1       %5, %3
    punpckl%2   %3, %4
    punpckh%2   %5, %4
%endmacro

%macro SBUTTERFLY2 5  ; not really needed, but allows transpose4x4 to not shuffle registers
    mov%1       %5, %3
    punpckh%2   %3, %4
    punpckl%2   %5, %4
%endmacro

%macro TRANSPOSE4x4D 5   ; ABCD-T -> ADTC
    SBUTTERFLY dqa, dq,  %1, %2, %5
    SBUTTERFLY dqa, dq,  %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3
%endmacro

%macro TRANSPOSE2x4x4W 5   ; ABCD-T -> ABCD
    SBUTTERFLY  dqa, wd,  %1, %2, %5
    SBUTTERFLY  dqa, wd,  %3, %4, %2
    SBUTTERFLY  dqa, dq,  %1, %3, %4
    SBUTTERFLY2 dqa, dq,  %5, %2, %3
    SBUTTERFLY  dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3
%endmacro

%macro TRANSPOSE8x8 9   ; ABCDEFGH-T -> AFHDTECB
    SBUTTERFLY dqa, wd, %1, %2, %9
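; SUMSUB_BADC above is an in-place butterfly on word lanes: it leaves the
; sum a+b in its first operand and the difference b-a in its second (and
; likewise for the c/d pair), and HADAMARD1x4 chains two such passes into a
; 4-point Hadamard transform on eight lanes at once. A scalar C sketch of a
; single lane (illustrative; the name hadamard1x4_c is made up here):
;
;   #include <stdint.h>
;
;   void hadamard1x4_c( int16_t v[4] )
;   {
;       /* first pass: butterflies on (v0,v1) and (v2,v3) */
;       int16_t a = v[0] + v[1], b = v[1] - v[0];
;       int16_t c = v[2] + v[3], d = v[3] - v[2];
;       /* second pass: butterflies across the pairs */
;       v[0] = a + c;  v[2] = c - a;
;       v[1] = b + d;  v[3] = d - b;
;   }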
