;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "i386inc.asm"

SECTION_RODATA

pd_0000ffff: times 4 dd 0x0000ffff

SECTION .text

cglobal x264_pixel_sad_16x16_sse2
cglobal x264_pixel_sad_16x8_sse2
cglobal x264_pixel_sad_x3_16x16_sse2
cglobal x264_pixel_sad_x3_16x8_sse2
cglobal x264_pixel_sad_x4_16x16_sse2
cglobal x264_pixel_sad_x4_16x8_sse2
cglobal x264_pixel_ssd_16x16_sse2
cglobal x264_pixel_ssd_16x8_sse2
cglobal x264_pixel_satd_8x4_sse2
cglobal x264_pixel_satd_8x8_sse2
cglobal x264_pixel_satd_16x8_sse2
cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2

%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm1, [ecx]
    movdqu  xmm2, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    movdqu  xmm3, [ecx]
    movdqu  xmm4, [ecx+edx]
    psadbw  xmm1, [eax]
    psadbw  xmm2, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    psadbw  xmm3, [eax]
    psadbw  xmm4, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm1, xmm2
    paddw   xmm3, xmm4
    paddw   xmm0, xmm1
    paddw   xmm0, xmm3
%endmacro

%macro SAD_START_SSE2 0
    push    ebx

    mov     eax, [esp+ 8]    ; pix1
    mov     ebx, [esp+12]    ; stride1
    mov     ecx, [esp+16]    ; pix2
    mov     edx, [esp+20]    ; stride2
%endmacro

%macro SAD_END_SSE2 0
    movdqa  xmm1, xmm0
    psrldq  xmm0, 8
    paddw   xmm0, xmm1
    movd    eax,  xmm0

    pop     ebx
    ret
%endmacro
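; For reference, a scalar sketch (illustrative C, not part of this file) of
; the quantity the SAD helpers above accumulate, with pix1/stride1 and
; pix2/stride2 as loaded by SAD_START_SSE2:
;
;   int sad = 0;
;   for( int y = 0; y < height; y++ )
;       for( int x = 0; x < 16; x++ )
;           sad += abs( pix1[y*stride1+x] - pix2[y*stride2+x] );
;
; psadbw folds eight byte differences per 64-bit half into one word, so each
; 16-pixel row costs a single instruction; SAD_END_SSE2 adds the high qword
; of xmm0 into the low one and returns the total in eax.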
ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x16_sse2:
    SAD_START_SSE2
    movdqu  xmm0, [ecx]
    movdqu  xmm1, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    movdqu  xmm2, [ecx]
    movdqu  xmm3, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    psadbw  xmm0, [eax]
    psadbw  xmm1, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm4, [ecx]
    paddw   xmm0, xmm1
    psadbw  xmm2, [eax]
    psadbw  xmm3, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm5, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm2, xmm3
    movdqu  xmm6, [ecx]
    movdqu  xmm7, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm0, xmm2
    psadbw  xmm4, [eax]
    psadbw  xmm5, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm1, [ecx]
    paddw   xmm4, xmm5
    psadbw  xmm6, [eax]
    psadbw  xmm7, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm2, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm6, xmm7
    movdqu  xmm3, [ecx]
    paddw   xmm0, xmm4
    movdqu  xmm4, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm0, xmm6
    psadbw  xmm1, [eax]
    psadbw  xmm2, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm5, [ecx]
    paddw   xmm1, xmm2
    psadbw  xmm3, [eax]
    psadbw  xmm4, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm6, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm3, xmm4
    movdqu  xmm7, [ecx]
    paddw   xmm0, xmm1
    movdqu  xmm1, [ecx+edx]
    paddw   xmm0, xmm3
    psadbw  xmm5, [eax]
    psadbw  xmm6, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    paddw   xmm5, xmm6
    psadbw  xmm7, [eax]
    psadbw  xmm1, [eax+ebx]
    paddw   xmm7, xmm1
    paddw   xmm0, xmm5
    paddw   xmm0, xmm7
    SAD_END_SSE2

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x8_sse2:
    SAD_START_SSE2
    pxor    xmm0, xmm0
    SAD_INC_4x16P_SSE2
    SAD_INC_4x16P_SSE2
    SAD_END_SSE2

%macro SAD_X3_START_1x16P 0
    push    edi
    push    esi
    mov     edi, [esp+12]
    mov     eax, [esp+16]
    mov     ecx, [esp+20]
    mov     edx, [esp+24]
    mov     esi, [esp+28]
    movdqa  xmm3, [edi]
    movdqu  xmm0, [eax]
    movdqu  xmm1, [ecx]
    movdqu  xmm2, [edx]
    psadbw  xmm0, xmm3
    psadbw  xmm1, xmm3
    psadbw  xmm2, xmm3
%endmacro

%macro SAD_X3_1x16P 2
    movdqa  xmm3, [edi+%1]
    movdqu  xmm4, [eax+%2]
    movdqu  xmm5, [ecx+%2]
    movdqu  xmm6, [edx+%2]
    psadbw  xmm4, xmm3
    psadbw  xmm5, xmm3
    psadbw  xmm6, xmm3
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
%endmacro

%macro SAD_X3_2x16P 1
%if %1
    SAD_X3_START_1x16P
%else
    SAD_X3_1x16P 0, 0
%endif
    SAD_X3_1x16P FENC_STRIDE, esi
    add     edi, 2*FENC_STRIDE
    lea     eax, [eax+2*esi]
    lea     ecx, [ecx+2*esi]
    lea     edx, [edx+2*esi]
%endmacro

%macro SAD_X4_START_1x16P 0
    push    edi
    push    esi
    push    ebx
    mov     edi, [esp+16]
    mov     eax, [esp+20]
    mov     ebx, [esp+24]
    mov     ecx, [esp+28]
    mov     edx, [esp+32]
    mov     esi, [esp+36]
    movdqa  xmm7, [edi]
    movdqu  xmm0, [eax]
    movdqu  xmm1, [ebx]
    movdqu  xmm2, [ecx]
    movdqu  xmm3, [edx]
    psadbw  xmm0, xmm7
    psadbw  xmm1, xmm7
    psadbw  xmm2, xmm7
    psadbw  xmm3, xmm7
%endmacro

%macro SAD_X4_1x16P 2
    movdqa  xmm7, [edi+%1]
    movdqu  xmm4, [eax+%2]
    movdqu  xmm5, [ebx+%2]
    movdqu  xmm6, [ecx+%2]
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    paddw   xmm0, xmm4
    psadbw  xmm6, xmm7
    movdqu  xmm4, [edx+%2]
    paddw   xmm1, xmm5
    psadbw  xmm4, xmm7
    paddw   xmm2, xmm6
    paddw   xmm3, xmm4
%endmacro

%macro SAD_X4_2x16P 1
%if %1
    SAD_X4_START_1x16P
%else
    SAD_X4_1x16P 0, 0
%endif
    SAD_X4_1x16P FENC_STRIDE, esi
    add     edi, 2*FENC_STRIDE
    lea     eax, [eax+2*esi]
    lea     ebx, [ebx+2*esi]
    lea     ecx, [ecx+2*esi]
    lea     edx, [edx+2*esi]
%endmacro

%macro SAD_X3_END 0
    mov     eax, [esp+32]
    pshufd  xmm4, xmm0, 2
    pshufd  xmm5, xmm1, 2
    pshufd  xmm6, xmm2, 2
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    movd    [eax+0], xmm0
    movd    [eax+4], xmm1
    movd    [eax+8], xmm2
    pop     esi
    pop     edi
    ret
%endmacro

%macro SAD_X4_END 0
    mov     eax, [esp+40]
    pshufd  xmm4, xmm0, 2
    pshufd  xmm5, xmm1, 2
    pshufd  xmm6, xmm2, 2
    pshufd  xmm7, xmm3, 2
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    paddw   xmm3, xmm7
    movd    [eax+0],  xmm0
    movd    [eax+4],  xmm1
    movd    [eax+8],  xmm2
    movd    [eax+12], xmm3
    pop     ebx
    pop     esi
    pop     edi
    ret
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;  void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                     uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
ALIGN 16
x264_pixel_sad_x%1_%2x%3_sse2:
    SAD_X%1_2x%2P 1
%rep %3/2-1
    SAD_X%1_2x%2P 0
%endrep
    SAD_X%1_END
%endmacro

SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 4, 16, 16
SAD_X 4, 16,  8

%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm1, [eax]
    movdqu  xmm2, [ecx]
    movdqu  xmm3, [eax+ebx]
    movdqu  xmm4, [ecx+edx]

    movdqa  xmm5, xmm1
    movdqa  xmm6, xmm3
    psubusb xmm1, xmm2
    psubusb xmm3, xmm4
    psubusb xmm2, xmm5
    psubusb xmm4, xmm6
    por     xmm1, xmm2
    por     xmm3, xmm4

    movdqa  xmm2, xmm1
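    ; The remainder of this macro is a minimal completion sketch, assuming
    ; (as in x264's SSD prologue, not shown above) that xmm7 is zero and
    ; xmm0 holds the running sum: widen the absolute byte differences to
    ; words, square and pairwise-add them with pmaddwd, then accumulate
    ; the dword partial sums in xmm0.
    movdqa    xmm4, xmm3
    punpcklbw xmm1, xmm7          ; bytes 0-7  of row 0 -> words
    punpckhbw xmm2, xmm7          ; bytes 8-15 of row 0 -> words
    punpcklbw xmm3, xmm7          ; bytes 0-7  of row 1 -> words
    punpckhbw xmm4, xmm7          ; bytes 8-15 of row 1 -> words
    pmaddwd   xmm1, xmm1          ; d0*d0+d1*d1, ... as 4 dwords
    pmaddwd   xmm2, xmm2
    pmaddwd   xmm3, xmm3
    pmaddwd   xmm4, xmm4
    lea       eax, [eax+2*ebx]    ; advance both pixel pointers by 2 rows
    lea       ecx, [ecx+2*edx]
    paddd     xmm1, xmm2
    paddd     xmm3, xmm4
    paddd     xmm0, xmm1
    paddd     xmm0, xmm3
%endmacro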
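; Hypothetical C-side usage of the exported symbols (prototypes as in the
; comment headers above; variable names are illustrative only):
;
;   int sad = x264_pixel_sad_16x16_sse2( pix1, stride1, pix2, stride2 );
;
;   int scores[3];
;   x264_pixel_sad_x3_16x16_sse2( fenc, ref0, ref1, ref2, i_stride, scores );
;   /* scores[0..2] receive the 16x16 SADs of fenc against each candidate;
;      fenc is read with FENC_STRIDE and movdqa, so it must be a 16-byte
;      aligned encoder cache block. */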