📄 sad_sse2.asm
字号:
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - SSE2 optimized SAD operators -
; *
; *  Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
; *
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: sad_sse2.asm,v 1.12 2004/08/29 10:02:38 edgomez Exp $
; *
; ***************************************************************************/

BITS 32

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;   PREFIX     : the exported symbol gets a leading underscore and <name> is
;                %define'd so later uses of the plain name pick up the alias.
;   MARK_FUNCS : the symbol is declared as a function whose size is the
;                distance from <name> to its local label <name>.endfunc,
;                so every exported routine must end with a `.endfunc` label.
;
; NOTE(review): when PREFIX and MARK_FUNCS are both defined, the %define
; below carries the ":function ..." decoration into every later use of the
; name (including the label definition) -- confirm that this combination is
; actually built anywhere.
;-----------------------------------------------------------------------------

%macro cglobal 1
    %ifdef PREFIX
        %ifdef MARK_FUNCS
            global _%1:function %1.endfunc-%1
            %define %1 _%1:function %1.endfunc-%1
        %else
            global _%1
            %define %1 _%1
        %endif
    %else
        %ifdef MARK_FUNCS
            global %1:function %1.endfunc-%1
        %else
            global %1
        %endif
    %endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

%ifndef FORMAT_COFF
SECTION .rodata align=16
%else
SECTION .rodata
%endif

; NOTE(review): ALIGN 64 is requested inside a section declared with
; align=16 (or with no explicit alignment for COFF) -- verify the intended
; alignment. `zero` is not referenced by any code in this file as shown.
ALIGN 64
zero:
        times 4 dd 0

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal sad16_sse2
cglobal dev16_sse2

;-----------------------------------------------------------------------------
; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned!
;                      const uint8_t * const ref,
;                      const uint32_t stride,
;                      const uint32_t /*ignored*/);
;
; In    : [esp+ 4] = cur, [esp+ 8] = ref, [esp+12] = stride (stack args)
; Out   : eax = sum of absolute differences over 16 rows of 16 bytes
; Clobb : eax, ecx, edx, xmm0-xmm3, xmm5, xmm6
;-----------------------------------------------------------------------------

; Adds the SAD of two consecutive 16-byte rows to the accumulator.
;   eax = cur (read with movdqa, so both row addresses must be 16-byte
;         aligned), edx = ref (read with movdqu), ecx = stride.
;   xmm6 accumulates the two per-half sums that psadbw produces.
;   On exit eax and edx have each advanced by 2*stride.
;   Clobbers xmm0-xmm3.
%macro SAD_2ROWS_SSE2 0
        movdqu  xmm0, [edx]             ; ref, row 0
        movdqu  xmm1, [edx+ecx]         ; ref, row 1
        lea     edx, [edx+2*ecx]        ; ref += 2*stride
        movdqa  xmm2, [eax]             ; cur, row 0
        movdqa  xmm3, [eax+ecx]         ; cur, row 1
        lea     eax, [eax+2*ecx]        ; cur += 2*stride
        psadbw  xmm0, xmm2              ; |ref - cur| summed per 8-byte half
        paddusw xmm6, xmm0
        psadbw  xmm1, xmm3
        paddusw xmm6, xmm1
%endmacro

ALIGN 16
sad16_sse2:
        mov     eax, [esp+ 4]           ; cur (assumed aligned)
        mov     edx, [esp+ 8]           ; ref
        mov     ecx, [esp+12]           ; stride
        pxor    xmm6, xmm6              ; accumulator = 0

%rep 8                                  ; 8 row pairs = 16 rows
        SAD_2ROWS_SSE2
%endrep

        pshufd  xmm5, xmm6, 00000010b   ; bring the upper-half sum down to dword 0
        paddusw xmm6, xmm5              ; word 0 = low-half sum + high-half sum
        pextrw  eax, xmm6, 0            ; return the combined 16-bit total
        ret
.endfunc

;-----------------------------------------------------------------------------
; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride);
;
; In    : [esp+ 4] = cur, [esp+ 8] = stride (stack args)
; Out   : eax = sum over 16 rows of 16 bytes of |pixel - mean|, where
;         mean = (sum of the 256 pixels) >> 8
; Clobb : eax, ecx, xmm0, xmm1, xmm6, xmm7
;-----------------------------------------------------------------------------

; Adds, for two consecutive 16-byte rows, the sum of |src - xmm7| to xmm6.
;   eax = src (read with movdqu), ecx = stride.
;   xmm7 = per-byte reference value: all zeroes (plain pixel sum) or the
;          mean replicated in every byte (deviation).
;   On exit eax has advanced by 2*stride.
;   Clobbers xmm0, xmm1.
%macro DEV_2ROWS_SSE2 0
        movdqu  xmm0, [eax]             ; row 0
        movdqu  xmm1, [eax+ecx]         ; row 1
        lea     eax, [eax+2*ecx]        ; src += 2*stride
        psadbw  xmm0, xmm7
        paddusw xmm6, xmm0
        psadbw  xmm1, xmm7
        paddusw xmm6, xmm1
%endmacro

ALIGN 16
dev16_sse2:
        mov     eax, [esp+ 4]           ; src
        mov     ecx, [esp+ 8]           ; stride
        pxor    xmm6, xmm6              ; accumulator = 0
        pxor    xmm7, xmm7              ; reference = 0 => pass 1 sums the pixels

        ; ---- pass 1: sum of all pixels ----
%rep 8
        DEV_2ROWS_SSE2
%endrep

        mov     eax, [esp+ 4]           ; rewind src for the second pass

        pshufd  xmm7, xmm6, 10b         ; upper-half sum -> dword 0
        paddusw xmm7, xmm6              ; word 0 = total pixel sum
        pxor    xmm6, xmm6              ; reset accumulator
        psrlw   xmm7, 8                 ; total / 256 => mean in word 0
        pshuflw xmm7, xmm7, 0           ; mean in the four low words
        packuswb xmm7, xmm7             ; words -> bytes: mean in bytes 0..3
        pshufd  xmm7, xmm7, 00000000b   ; broadcast dword 0: mean in all 16 bytes

        ; ---- pass 2: sum of |pixel - mean| ----
%rep 8
        DEV_2ROWS_SSE2
%endrep

        pshufd  xmm7, xmm6, 10b         ; upper-half sum -> dword 0
        paddusw xmm7, xmm6              ; word 0 = total deviation
        pextrw  eax, xmm7, 0            ; return the combined 16-bit total
        ret
.endfunc
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -