pixel-sse2.asm
;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*          Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

SECTION .rodata align=16

pw_1:    times 8 dw 1
ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
         times 16 db 0

SECTION .text

%macro HADDD 2 ; sum junk
    movhlps %2, %1
    paddd   %1, %2
    pshuflw %2, %1, 0xE
    paddd   %1, %2
%endmacro

%macro HADDW 2
    pmaddwd %1, [pw_1 GLOBAL]
    HADDD   %1, %2
%endmacro

%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm1, [rdx]
    movdqu  xmm2, [rdx+rcx]
    lea     rdx,  [rdx+2*rcx]
    movdqu  xmm3, [rdx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm1, [rdi]
    psadbw  xmm2, [rdi+rsi]
    lea     rdi,  [rdi+2*rsi]
    psadbw  xmm3, [rdi]
    psadbw  xmm4, [rdi+rsi]
    lea     rdi,  [rdi+2*rsi]
    lea     rdx,  [rdx+2*rcx]
    paddw   xmm1, xmm2
    paddw   xmm3, xmm4
    paddw   xmm0, xmm1
    paddw   xmm0, xmm3
%endmacro

%macro SAD_END_SSE2 0
    movhlps xmm1, xmm0
    paddw   xmm0, xmm1
    movd    eax,  xmm0
    ret
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x16_sse2
    movdqu  xmm0, [rdx]
    movdqu  xmm1, [rdx+rcx]
    lea     rdx,  [rdx+2*rcx]
    movdqu  xmm2, [rdx]
    movdqu  xmm3, [rdx+rcx]
    lea     rdx,  [rdx+2*rcx]
    psadbw  xmm0, [rdi]
    psadbw  xmm1, [rdi+rsi]
    lea     rdi,  [rdi+2*rsi]
    movdqu  xmm4, [rdx]
    paddw   xmm0, xmm1
    psadbw  xmm2, [rdi]
    psadbw  xmm3, [rdi+rsi]
    lea     rdi,  [rdi+2*rsi]
    movdqu  xmm5, [rdx+rcx]
    lea     rdx,  [rdx+2*rcx]
    paddw   xmm2, xmm3
    movdqu  xmm6, [rdx]
    movdqu  xmm7, [rdx+rcx]
    lea     rdx,  [rdx+2*rcx]
    paddw   xmm0, xmm2
    psadbw  xmm4, [rdi]
    psadbw  xmm5, [rdi+rsi]
    lea     rdi,  [rdi+2*rsi]
    movdqu  xmm1, [rdx]
    paddw   xmm4, xmm5
    psadbw  xmm6, [rdi]
    psadbw  xmm7, [rdi+rsi]
    lea     rdi,  [rdi+2*rsi]
    movdqu  xmm2, [rdx+rcx]
    lea     rdx,  [rdx+2*rcx]
    paddw   xmm6, xmm7
    movdqu  xmm3, [rdx]
    paddw   xmm0, xmm4
    movdqu  xmm4, [rdx+rcx]
    lea     rdx,  [rdx+2*rcx]
    paddw   xmm0, xmm6
    psadbw  xmm1, [rdi]
    psadbw  xmm2, [rdi+rsi]
    lea     rdi,  [rdi+2*rsi]
    movdqu  xmm5, [rdx]
    paddw   xmm1, xmm2
    psadbw  xmm3, [rdi]
    psadbw  xmm4, [rdi+rsi]
    lea     rdi,  [rdi+2*rsi]
    movdqu  xmm6, [rdx+rcx]
    lea     rdx,  [rdx+2*rcx]
    paddw   xmm3, xmm4
    movdqu  xmm7, [rdx]
    paddw   xmm0, xmm1
    movdqu  xmm1, [rdx+rcx]
    paddw   xmm0, xmm3
    psadbw  xmm5, [rdi]
    psadbw  xmm6, [rdi+rsi]
    lea     rdi,  [rdi+2*rsi]
    paddw   xmm5, xmm6
    psadbw  xmm7, [rdi]
    psadbw  xmm1, [rdi+rsi]
    paddw   xmm7, xmm1
    paddw   xmm0, xmm5
    paddw   xmm0, xmm7
    SAD_END_SSE2

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x8_sse2
    pxor    xmm0, xmm0
    SAD_INC_4x16P_SSE2
    SAD_INC_4x16P_SSE2
    SAD_END_SSE2

; sad x3 / x4

%macro SAD_X3_START_1x16P 0
    movdqa  xmm3, [parm1q]
    movdqu  xmm0, [parm2q]
    movdqu  xmm1, [parm3q]
    movdqu  xmm2, [parm4q]
    psadbw  xmm0, xmm3
    psadbw  xmm1, xmm3
    psadbw  xmm2, xmm3
%endmacro

%macro SAD_X3_1x16P 2
    movdqa  xmm3, [parm1q+%1]
    movdqu  xmm4, [parm2q+%2]
    movdqu  xmm5, [parm3q+%2]
    movdqu  xmm6, [parm4q+%2]
    psadbw  xmm4, xmm3
    psadbw  xmm5, xmm3
    psadbw  xmm6, xmm3
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
%endmacro

%macro SAD_X3_2x16P 1
%if %1
    SAD_X3_START_1x16P
%else
    SAD_X3_1x16P 0, 0
%endif
    SAD_X3_1x16P FENC_STRIDE, parm5q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm5q]
    lea     parm3q, [parm3q+2*parm5q]
    lea     parm4q, [parm4q+2*parm5q]
%endmacro

%macro SAD_X4_START_1x16P 0
    movdqa  xmm7, [parm1q]
    movdqu  xmm0, [parm2q]
    movdqu  xmm1, [parm3q]
    movdqu  xmm2, [parm4q]
    movdqu  xmm3, [parm5q]
    psadbw  xmm0, xmm7
    psadbw  xmm1, xmm7
    psadbw  xmm2, xmm7
    psadbw  xmm3, xmm7
%endmacro

%macro SAD_X4_1x16P 2
    movdqa  xmm7, [parm1q+%1]
    movdqu  xmm4, [parm2q+%2]
    movdqu  xmm5, [parm3q+%2]
    movdqu  xmm6, [parm4q+%2]
    movdqu  xmm8, [parm5q+%2]
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    psadbw  xmm6, xmm7
    psadbw  xmm8, xmm7
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    paddw   xmm3, xmm8
%endmacro

%macro SAD_X4_2x16P 1
%if %1
    SAD_X4_START_1x16P
%else
    SAD_X4_1x16P 0, 0
%endif
    SAD_X4_1x16P FENC_STRIDE, parm6q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm6q]
    lea     parm3q, [parm3q+2*parm6q]
    lea     parm4q, [parm4q+2*parm6q]
    lea     parm5q, [parm5q+2*parm6q]
%endmacro

%macro SAD_X3_END 0
    movhlps xmm4, xmm0
    movhlps xmm5, xmm1
    movhlps xmm6, xmm2
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    movd    [parm6q+0], xmm0
    movd    [parm6q+4], xmm1
    movd    [parm6q+8], xmm2
    ret
%endmacro

%macro SAD_X4_END 0
    mov     rax, parm7q
    movhlps xmm4, xmm0
    movhlps xmm5, xmm1
    movhlps xmm6, xmm2
    movhlps xmm7, xmm3
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    paddw   xmm3, xmm7
    movd    [rax+0],  xmm0
    movd    [rax+4],  xmm1
    movd    [rax+8],  xmm2
    movd    [rax+12], xmm3
    ret
%endmacro

;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                    uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal x264_pixel_sad_x%1_%2x%3_sse2
    SAD_X%1_2x%2P 1
%rep %3/2-1
    SAD_X%1_2x%2P 0
%endrep
    SAD_X%1_END
%endmacro

SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 4, 16, 16
SAD_X 4, 16, 8

; ssd

%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm1, [rdi]
    movdqu  xmm2, [rdx]
    movdqu  xmm3, [rdi+rsi]
    movdqu  xmm4, [rdx+rcx]

    movdqa  xmm5, xmm1
    movdqa  xmm6, xmm3
    psubusb xmm1, xmm2
    psubusb xmm3, xmm4
    psubusb xmm2, xmm5
    psubusb xmm4, xmm6
    por     xmm1, xmm2
    por     xmm3, xmm4

    movdqa  xmm2, xmm1
    movdqa  xmm4, xmm3
    punpcklbw xmm1, xmm7
    punpckhbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    punpckhbw xmm4, xmm7
    pmaddwd xmm1, xmm1
    pmaddwd xmm2, xmm2
    pmaddwd xmm3, xmm3
    pmaddwd xmm4, xmm4

    lea     rdi, [rdi+2*rsi]
    lea     rdx, [rdx+2*rcx]

    paddd   xmm1, xmm2
    paddd   xmm3, xmm4
    paddd   xmm0, xmm1
    paddd   xmm0, xmm3
%endmacro

%macro SSD_START_SSE2 0
    pxor    xmm7, xmm7 ; zero
    pxor    xmm0, xmm0 ; mm0 holds the sum
%endmacro

%macro SSD_END_SSE2 0
    HADDD   xmm0, xmm1
    movd    eax,  xmm0
    ret
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x16_sse2
    SSD_START_SSE2
%rep 8
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x8_sse2
    SSD_START_SSE2
%rep 4
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

%macro SUMSUB_BADC 4
    paddw   %1, %2
    paddw   %3, %4
    paddw   %2, %2
    paddw   %4, %4
    psubw   %2, %1
    psubw   %4, %3
%endmacro

%macro HADAMARD1x4 4
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4
%endmacro

%macro HADAMARD1x8 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%endmacro

;;; row transform not used, because phaddw is much slower than paddw on a Conroe
;%macro PHSUMSUB 3
;    movdqa  %3, %1
;    phaddw  %1, %2
;    phsubw  %3, %2
;%endmacro

;%macro HADAMARD4x1_SSSE3 5 ; ABCD-T -> ADTC
;    PHSUMSUB %1, %2, %5
;    PHSUMSUB %3, %4, %2
;    PHSUMSUB %1, %3, %4
;    PHSUMSUB %5, %2, %3
;%endmacro

%macro SBUTTERFLY 5
    mov%1       %5, %3
    punpckl%2   %3, %4
    punpckh%2   %5, %4
%endmacro

%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers
    mov%1       %5, %3
    punpckh%2   %3, %4
    punpckl%2   %5, %4
%endmacro

%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
    SBUTTERFLY dqa, dq,  %1, %2, %5
    SBUTTERFLY dqa, dq,  %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3
%endmacro

%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
    SBUTTERFLY  dqa, wd,  %1, %2, %5
    SBUTTERFLY  dqa, wd,  %3, %4, %2
    SBUTTERFLY  dqa, dq,  %1, %3, %4
    SBUTTERFLY2 dqa, dq,  %5, %2, %3
    SBUTTERFLY  dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3
%endmacro

%macro TRANSPOSE8x8 9 ; ABCDEFGH-T -> AFHDTECB
    SBUTTERFLY dqa, wd, %1, %2, %9
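The listing above breaks off partway through the TRANSPOSE8x8 macro. For orientation, the SAD and SSD routines it defines are plain block comparisons: psadbw sums the absolute byte differences across each 16-pixel row, while the SSD macros widen the absolute differences to words and square-accumulate them with pmaddwd. Below is a minimal scalar C sketch of what x264_pixel_sad_16x16_sse2 and x264_pixel_ssd_16x16_sse2 compute, assuming the argument order from the prototypes in the comments (a pointer and stride for each block); the names sad_16x16_ref and ssd_16x16_ref are hypothetical stand-ins for illustration, not symbols exported by this file.

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a 16x16 block: the value returned by
 * x264_pixel_sad_16x16_sse2 (scalar sketch, not the actual x264 C reference). */
static int sad_16x16_ref( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
{
    int sum = 0;
    for( int y = 0; y < 16; y++, pix1 += stride1, pix2 += stride2 )
        for( int x = 0; x < 16; x++ )
            sum += abs( pix1[x] - pix2[x] );
    return sum;
}

/* Sum of squared differences over a 16x16 block: the value returned by
 * x264_pixel_ssd_16x16_sse2 (scalar sketch). */
static int ssd_16x16_ref( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
{
    int sum = 0;
    for( int y = 0; y < 16; y++, pix1 += stride1, pix2 += stride2 )
        for( int x = 0; x < 16; x++ )
        {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
    return sum;
}

The 16x8 variants cover 8 rows instead of 16, and the sad_x3/sad_x4 entry points compare one encoded block (fenc, addressed with the fixed FENC_STRIDE) against three or four candidate positions at once, writing the per-candidate sums to the scores array named in the prototype.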