;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "i386inc.asm"

SECTION_RODATA

pd_0000ffff: times 4 dd 0x0000ffff

SECTION .text

cglobal x264_pixel_sad_16x16_sse2
cglobal x264_pixel_sad_16x8_sse2
cglobal x264_pixel_sad_x3_16x16_sse2
cglobal x264_pixel_sad_x3_16x8_sse2
cglobal x264_pixel_sad_x4_16x16_sse2
cglobal x264_pixel_sad_x4_16x8_sse2
cglobal x264_pixel_ssd_16x16_sse2
cglobal x264_pixel_ssd_16x8_sse2
cglobal x264_pixel_satd_8x4_sse2
cglobal x264_pixel_satd_8x8_sse2
cglobal x264_pixel_satd_16x8_sse2
cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2

%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm1, [ecx]
    movdqu  xmm2, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    movdqu  xmm3, [ecx]
    movdqu  xmm4, [ecx+edx]
    psadbw  xmm1, [eax]
    psadbw  xmm2, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    psadbw  xmm3, [eax]
    psadbw  xmm4, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm1, xmm2
    paddw   xmm3, xmm4
    paddw   xmm0, xmm1
    paddw   xmm0, xmm3
%endmacro

%macro SAD_START_SSE2 0
    push    ebx

    mov     eax, [esp+ 8]    ; pix1
    mov     ebx, [esp+12]    ; stride1
    mov     ecx, [esp+16]    ; pix2
    mov     edx, [esp+20]    ; stride2
%endmacro

%macro SAD_END_SSE2 0
    movdqa  xmm1, xmm0
    psrldq  xmm0, 8
    paddw   xmm0, xmm1
    movd    eax,  xmm0

    pop     ebx
    ret
%endmacro
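; For reference, a scalar sketch (illustrative C, not part of this file) of
; the quantity the SAD helpers above accumulate, with pix1/stride1 and
; pix2/stride2 as loaded by SAD_START_SSE2:
;
;   int sad = 0;
;   for( int y = 0; y < height; y++ )
;       for( int x = 0; x < 16; x++ )
;           sad += abs( pix1[y*stride1+x] - pix2[y*stride2+x] );
;
; psadbw folds eight byte differences per 64-bit half into one word, so each
; 16-pixel row costs a single instruction; SAD_END_SSE2 adds the high qword
; of xmm0 into the low one and returns the total in eax.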
ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x16_sse2:
    SAD_START_SSE2
    movdqu  xmm0, [ecx]
    movdqu  xmm1, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    movdqu  xmm2, [ecx]
    movdqu  xmm3, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    psadbw  xmm0, [eax]
    psadbw  xmm1, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm4, [ecx]
    paddw   xmm0, xmm1
    psadbw  xmm2, [eax]
    psadbw  xmm3, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm5, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm2, xmm3
    movdqu  xmm6, [ecx]
    movdqu  xmm7, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm0, xmm2
    psadbw  xmm4, [eax]
    psadbw  xmm5, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm1, [ecx]
    paddw   xmm4, xmm5
    psadbw  xmm6, [eax]
    psadbw  xmm7, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm2, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm6, xmm7
    movdqu  xmm3, [ecx]
    paddw   xmm0, xmm4
    movdqu  xmm4, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm0, xmm6
    psadbw  xmm1, [eax]
    psadbw  xmm2, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm5, [ecx]
    paddw   xmm1, xmm2
    psadbw  xmm3, [eax]
    psadbw  xmm4, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    movdqu  xmm6, [ecx+edx]
    lea     ecx,  [ecx+2*edx]
    paddw   xmm3, xmm4
    movdqu  xmm7, [ecx]
    paddw   xmm0, xmm1
    movdqu  xmm1, [ecx+edx]
    paddw   xmm0, xmm3
    psadbw  xmm5, [eax]
    psadbw  xmm6, [eax+ebx]
    lea     eax,  [eax+2*ebx]
    paddw   xmm5, xmm6
    psadbw  xmm7, [eax]
    psadbw  xmm1, [eax+ebx]
    paddw   xmm7, xmm1
    paddw   xmm0, xmm5
    paddw   xmm0, xmm7
    SAD_END_SSE2

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x8_sse2:
    SAD_START_SSE2
    pxor    xmm0, xmm0
    SAD_INC_4x16P_SSE2
    SAD_INC_4x16P_SSE2
    SAD_END_SSE2

%macro SAD_X3_START_1x16P 0
    push    edi
    push    esi
    mov     edi, [esp+12]
    mov     eax, [esp+16]
    mov     ecx, [esp+20]
    mov     edx, [esp+24]
    mov     esi, [esp+28]
    movdqa  xmm3, [edi]
    movdqu  xmm0, [eax]
    movdqu  xmm1, [ecx]
    movdqu  xmm2, [edx]
    psadbw  xmm0, xmm3
    psadbw  xmm1, xmm3
    psadbw  xmm2, xmm3
%endmacro

%macro SAD_X3_1x16P 2
    movdqa  xmm3, [edi+%1]
    movdqu  xmm4, [eax+%2]
    movdqu  xmm5, [ecx+%2]
    movdqu  xmm6, [edx+%2]
    psadbw  xmm4, xmm3
    psadbw  xmm5, xmm3
    psadbw  xmm6, xmm3
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
%endmacro

%macro SAD_X3_2x16P 1
%if %1
    SAD_X3_START_1x16P
%else
    SAD_X3_1x16P 0, 0
%endif
    SAD_X3_1x16P FENC_STRIDE, esi
    add     edi, 2*FENC_STRIDE
    lea     eax, [eax+2*esi]
    lea     ecx, [ecx+2*esi]
    lea     edx, [edx+2*esi]
%endmacro

%macro SAD_X4_START_1x16P 0
    push    edi
    push    esi
    push    ebx
    mov     edi, [esp+16]
    mov     eax, [esp+20]
    mov     ebx, [esp+24]
    mov     ecx, [esp+28]
    mov     edx, [esp+32]
    mov     esi, [esp+36]
    movdqa  xmm7, [edi]
    movdqu  xmm0, [eax]
    movdqu  xmm1, [ebx]
    movdqu  xmm2, [ecx]
    movdqu  xmm3, [edx]
    psadbw  xmm0, xmm7
    psadbw  xmm1, xmm7
    psadbw  xmm2, xmm7
    psadbw  xmm3, xmm7
%endmacro

%macro SAD_X4_1x16P 2
    movdqa  xmm7, [edi+%1]
    movdqu  xmm4, [eax+%2]
    movdqu  xmm5, [ebx+%2]
    movdqu  xmm6, [ecx+%2]
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    paddw   xmm0, xmm4
    psadbw  xmm6, xmm7
    movdqu  xmm4, [edx+%2]
    paddw   xmm1, xmm5
    psadbw  xmm4, xmm7
    paddw   xmm2, xmm6
    paddw   xmm3, xmm4
%endmacro

%macro SAD_X4_2x16P 1
%if %1
    SAD_X4_START_1x16P
%else
    SAD_X4_1x16P 0, 0
%endif
    SAD_X4_1x16P FENC_STRIDE, esi
    add     edi, 2*FENC_STRIDE
    lea     eax, [eax+2*esi]
    lea     ebx, [ebx+2*esi]
    lea     ecx, [ecx+2*esi]
    lea     edx, [edx+2*esi]
%endmacro

%macro SAD_X3_END 0
    mov     eax, [esp+32]
    pshufd  xmm4, xmm0, 2
    pshufd  xmm5, xmm1, 2
    pshufd  xmm6, xmm2, 2
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    movd    [eax+0], xmm0
    movd    [eax+4], xmm1
    movd    [eax+8], xmm2
    pop     esi
    pop     edi
    ret
%endmacro

%macro SAD_X4_END 0
    mov     eax, [esp+40]
    pshufd  xmm4, xmm0, 2
    pshufd  xmm5, xmm1, 2
    pshufd  xmm6, xmm2, 2
    pshufd  xmm7, xmm3, 2
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    paddw   xmm3, xmm7
    movd    [eax+0],  xmm0
    movd    [eax+4],  xmm1
    movd    [eax+8],  xmm2
    movd    [eax+12], xmm3
    pop     ebx
    pop     esi
    pop     edi
    ret
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;  void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                     uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
ALIGN 16
x264_pixel_sad_x%1_%2x%3_sse2:
    SAD_X%1_2x%2P 1
%rep %3/2-1
    SAD_X%1_2x%2P 0
%endrep
    SAD_X%1_END
%endmacro

SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 4, 16, 16
SAD_X 4, 16,  8

%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm1, [eax]
    movdqu  xmm2, [ecx]
    movdqu  xmm3, [eax+ebx]
    movdqu  xmm4, [ecx+edx]

    movdqa  xmm5, xmm1
    movdqa  xmm6, xmm3
    psubusb xmm1, xmm2
    psubusb xmm3, xmm4
    psubusb xmm2, xmm5
    psubusb xmm4, xmm6
    por     xmm1, xmm2
    por     xmm3, xmm4

    movdqa  xmm2, xmm1
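    ; The remainder of this macro is a minimal completion sketch, assuming
    ; (as in x264's SSD prologue, not shown above) that xmm7 is zero and
    ; xmm0 holds the running sum: widen the absolute byte differences to
    ; words, square and pairwise-add them with pmaddwd, then accumulate
    ; the dword partial sums in xmm0.
    movdqa    xmm4, xmm3
    punpcklbw xmm1, xmm7          ; bytes 0-7  of row 0 -> words
    punpckhbw xmm2, xmm7          ; bytes 8-15 of row 0 -> words
    punpcklbw xmm3, xmm7          ; bytes 0-7  of row 1 -> words
    punpckhbw xmm4, xmm7          ; bytes 8-15 of row 1 -> words
    pmaddwd   xmm1, xmm1          ; d0*d0+d1*d1, ... as 4 dwords
    pmaddwd   xmm2, xmm2
    pmaddwd   xmm3, xmm3
    pmaddwd   xmm4, xmm4
    lea       eax, [eax+2*ebx]    ; advance both pixel pointers by 2 rows
    lea       ecx, [ecx+2*edx]
    paddd     xmm1, xmm2
    paddd     xmm3, xmm4
    paddd     xmm0, xmm1
    paddd     xmm0, xmm3
%endmacro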
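; Hypothetical C-side usage of the exported symbols (prototypes as in the
; comment headers above; variable names are illustrative only):
;
;   int sad = x264_pixel_sad_16x16_sse2( pix1, stride1, pix2, stride2 );
;
;   int scores[3];
;   x264_pixel_sad_x3_16x16_sse2( fenc, ref0, ref1, ref2, i_stride, scores );
;   /* scores[0..2] receive the 16x16 SADs of fenc against each candidate;
;      fenc is read with FENC_STRIDE and movdqa, so it must be a 16-byte
;      aligned encoder cache block. */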