
📄 sad_3dne.asm

📁 This is a compression/decompression package, written in C, with detailed source code included.
💻 ASM
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - K7 optimized SAD operators -
; *
; *  Copyright(C) 2002 Jaan Kalda
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: sad_3dne.asm,v 1.6 2004/08/29 10:02:38 edgomez Exp $
; *
; ***************************************************************************/

; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines

BITS 32

%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			%define %1 _%1:function %1.endfunc-%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

ALIGN 16
mmx_one:
	times 4	dw 1

;=============================================================================
; Helper macros
;=============================================================================

;; %1 block number (0..4)
%macro SAD_16x16_SSE 1
  movq mm7, [eax]
  movq mm6, [eax+8]
  psadbw mm7, [edx]
  psadbw mm6, [edx+8]
%if (%1)
  paddd mm1, mm5
%endif
  movq mm5, [eax+ecx]
  movq mm4, [eax+ecx+8]
  psadbw mm5, [edx+ecx]
  psadbw mm4, [edx+ecx+8]
  movq mm3, [eax+2*ecx]
  movq mm2, [eax+2*ecx+8]
  psadbw mm3, [edx+2*ecx]
  psadbw mm2, [edx+2*ecx+8]
%if (%1)
  movd [esp+4*(%1-1)], mm1
%else
  sub esp, byte 12
%endif
  movq mm1, [eax+ebx]
  movq mm0, [eax+ebx+8]
  psadbw mm1, [edx+ebx]
  psadbw mm0, [edx+ebx+8]
  lea eax, [eax+4*ecx]
  lea edx, [edx+4*ecx]
  paddd mm7, mm6
  paddd mm5, mm4
  paddd mm3, mm2
  paddd mm1, mm0
  paddd mm5, mm7
  paddd mm1, mm3
%endmacro

%macro SADBI_16x16_SSE0 0
  movq mm2, [edx]
  movq mm3, [edx+8]
  movq mm5, [byte eax]
  movq mm6, [eax+8]
  pavgb mm2, [byte ebx]
  pavgb mm3, [ebx+8]
  add edx, ecx
  psadbw mm5, mm2
  psadbw mm6, mm3
  add eax, ecx
  add ebx, ecx
  movq mm2, [byte edx]
  movq mm3, [edx+8]
  movq mm0, [byte eax]
  movq mm1, [eax+8]
  pavgb mm2, [byte ebx]
  pavgb mm3, [ebx+8]
  add edx, ecx
  add eax, ecx
  add ebx, ecx
  psadbw mm0, mm2
  psadbw mm1, mm3
%endmacro

%macro SADBI_16x16_SSE 0
  movq mm2, [byte edx]
  movq mm3, [edx+8]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [eax]
  movq mm1, [eax+8]
  pavgb mm2, [ebx]
  pavgb mm3, [ebx+8]
  add edx, ecx
  add eax, ecx
  add ebx, ecx
  psadbw mm0, mm2
  psadbw mm1, mm3
%endmacro

%macro SADBI_8x8_3dne 0
  movq mm2, [edx]
  movq mm3, [edx+ecx]
  pavgb mm2, [eax]
  pavgb mm3, [eax+ecx]
  lea edx, [edx+2*ecx]
  lea eax, [eax+2*ecx]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [ebx]
  movq mm1, [ebx+ecx]
  lea ebx, [ebx+2*ecx]
  psadbw mm0, mm2
  psadbw mm1, mm3
%endmacro

%macro ABS_16x16_SSE 1
%if (%1 == 0)
  movq mm7, [eax]
  psadbw mm7, mm4
  mov esi, esi
  movq mm6, [eax+8]
  movq mm5, [eax+ecx]
  movq mm3, [eax+ecx+8]
  psadbw mm6, mm4
  movq mm2, [byte eax+2*ecx]
  psadbw mm5, mm4
  movq mm1, [eax+2*ecx+8]
  psadbw mm3, mm4
  movq mm0, [dword eax+edx]
  psadbw mm2, mm4
  add eax, edx
  psadbw mm1, mm4
%endif
%if (%1 == 1)
  psadbw mm0, mm4
  paddd mm7, mm0
  movq mm0, [eax+8]
  psadbw mm0, mm4
  paddd mm6, mm0
  movq mm0, [byte eax+ecx]
  psadbw mm0, mm4
  paddd mm5, mm0
  movq mm0, [eax+ecx+8]
  psadbw mm0, mm4
  paddd mm3, mm0
  movq mm0, [eax+2*ecx]
  psadbw mm0, mm4
  paddd mm2, mm0
  movq mm0, [eax+2*ecx+8]
  add eax, edx
  psadbw mm0, mm4
  paddd mm1, mm0
  movq mm0, [eax]
%endif
%if (%1 == 2)
  psadbw mm0, mm4
  paddd mm7, mm0
  movq mm0, [eax+8]
  psadbw mm0, mm4
  paddd mm6, mm0
%endif
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal sad16_3dne
cglobal sad8_3dne
cglobal sad16bi_3dne
cglobal sad8bi_3dne
cglobal dev16_3dne

;-----------------------------------------------------------------------------
;
; uint32_t sad16_3dne(const uint8_t * const cur,
;                     const uint8_t * const ref,
;                     const uint32_t stride,
;                     const uint32_t best_sad);
;
;-----------------------------------------------------------------------------

; optimization: 21% faster

ALIGN 16
sad16_3dne:

  mov eax, [esp+ 4] ; Src1
  mov edx, [esp+ 8] ; Src2
  mov ecx, [esp+12] ; Stride
  push ebx
  lea ebx, [2*ecx+ecx]

  SAD_16x16_SSE 0
  SAD_16x16_SSE 1
  SAD_16x16_SSE 2
  SAD_16x16_SSE 3

  mov ecx, [esp]
  add ecx, [esp+4]
  add ecx, [esp+8]
  paddd mm1, mm5
  mov ebx, [esp+12]
  add esp, byte 4+12
  movd eax, mm1
  add eax, ecx

  ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t sad8_3dne(const uint8_t * const cur,
;					const uint8_t * const ref,
;					const uint32_t stride);
;
;-----------------------------------------------------------------------------

ALIGN 16
sad8_3dne:

  mov eax, [esp+ 4] ; Src1
  mov ecx, [esp+12] ; Stride
  mov edx, [esp+ 8] ; Src2
  push ebx
  lea ebx, [ecx+2*ecx]

  movq mm0, [byte eax]      ;0
  psadbw mm0, [byte edx]
  movq mm1, [eax+ecx]       ;1
  psadbw mm1, [edx+ecx]

  movq mm2, [eax+2*ecx]     ;2
  psadbw mm2, [edx+2*ecx]
  movq mm3, [eax+ebx]       ;3
  psadbw mm3, [edx+ebx]

  paddd mm0, mm1

  movq mm4, [byte eax+4*ecx];4
  psadbw mm4, [edx+4*ecx]
  movq mm5, [eax+2*ebx]     ;6
  psadbw mm5, [edx+2*ebx]

  paddd mm2, mm3
  paddd mm0, mm2

  lea ebx, [ebx+4*ecx]      ;3+4=7
  lea ecx, [ecx+4*ecx]      ; 5
  movq mm6, [eax+ecx]       ;5
  psadbw mm6, [edx+ecx]
  movq mm7, [eax+ebx]       ;7
  psadbw mm7, [edx+ebx]

  paddd mm4, mm5
  paddd mm6, mm7
  paddd mm0, mm4

  mov ebx, [esp]
  add esp, byte 4
  paddd mm0, mm6
  movd eax, mm0

  ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t sad16bi_3dne(const uint8_t * const cur,
;					const uint8_t * const ref1,
;					const uint8_t * const ref2,
;					const uint32_t stride);
;
;-----------------------------------------------------------------------------
; optimization: 14% faster

ALIGN 16
sad16bi_3dne:

  mov eax, [esp+ 4] ; Src
  mov edx, [esp+ 8] ; Ref1
  mov ecx, [esp+16] ; Stride
  push ebx
  mov ebx, [esp+4+12] ; Ref2

  SADBI_16x16_SSE0
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE

  paddusw mm5, mm0
  paddusw mm6, mm1
  pop ebx
  paddusw mm6, mm5

  movd eax, mm6
  ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t sad8bi_3dne(const uint8_t * const cur,
;					const uint8_t * const ref1,
;					const uint8_t * const ref2,
;					const uint32_t stride);
;
;-----------------------------------------------------------------------------

ALIGN 16
sad8bi_3dne:

  mov eax, [esp+12] ; Ref2
  mov edx, [esp+ 8] ; Ref1
  mov ecx, [esp+16] ; Stride
  push ebx
  mov ebx, [esp+4+ 4] ; Src

  movq mm2, [edx]
  movq mm3, [edx+ecx]
  pavgb mm2, [eax]
  pavgb mm3, [eax+ecx]
  lea edx, [edx+2*ecx]
  lea eax, [eax+2*ecx]
  movq mm5, [ebx]
  movq mm6, [ebx+ecx]
  lea ebx, [ebx+2*ecx]
  psadbw mm5, mm2
  psadbw mm6, mm3

  movq mm2, [edx]
  movq mm3, [edx+ecx]
  pavgb mm2, [eax]
  pavgb mm3, [eax+ecx]
  lea edx, [edx+2*ecx]
  lea eax, [eax+2*ecx]
  movq mm0, [ebx]
  movq mm1, [ebx+ecx]
  lea ebx, [ebx+2*ecx]
  psadbw mm0, mm2
  psadbw mm1, mm3

  movq mm2, [edx]
  movq mm3, [edx+ecx]
  pavgb mm2, [eax]
  pavgb mm3, [eax+ecx]
  lea edx, [edx+2*ecx]
  lea eax, [eax+2*ecx]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [ebx]
  movq mm1, [ebx+ecx]
  lea ebx, [ebx+2*ecx]
  psadbw mm0, mm2
  psadbw mm1, mm3

  movq mm2, [edx]
  movq mm3, [edx+ecx]
  pavgb mm2, [eax]
  pavgb mm3, [eax+ecx]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [ebx]
  movq mm1, [ebx+ecx]
  psadbw mm0, mm2
  psadbw mm1, mm3
  paddusw mm5, mm0
  paddusw mm6, mm1

  paddusw mm6, mm5
  mov ebx, [esp]
  add esp, byte 4
  movd eax, mm6

  ret
.endfunc

;===========================================================================
;
; uint32_t dev16_3dne(const uint8_t * const cur,
;					const uint32_t stride);
;
;===========================================================================
; optimization: 25% faster

ALIGN 16
dev16_3dne:

  mov eax, [esp+ 4] ; Src
  mov ecx, [esp+ 8] ; Stride
  lea edx, [ecx+2*ecx]

  pxor mm4, mm4

ALIGN 8
  ABS_16x16_SSE 0
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1

  paddd mm1, mm2
  paddd mm3, mm5

  ABS_16x16_SSE 2

  paddd mm7, mm6
  paddd mm1, mm3
  mov eax, [esp+ 4]         ; Src
  paddd mm7, mm1

  punpcklbw mm7, mm7        ;xxyyaazz
  pshufw mm4, mm7, 055h     ; mm4 contains the mean

  pxor mm1, mm1

  ABS_16x16_SSE 0
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1

  paddd mm1, mm2
  paddd mm3, mm5

  ABS_16x16_SSE 2

  paddd mm7, mm6
  paddd mm1, mm3
  paddd mm7, mm1

  movd eax, mm7
  ret
.endfunc
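
For readers comparing this file against XviD's C side, the sketch below shows what the three main operators compute, written in portable C. It is a minimal reference sketch, not XviD's own code: the *_ref function names are hypothetical, and only the prototypes in the comment blocks above come from the source. Note that this sad16 implementation only ever loads its first three stack arguments, so the best_sad early-out hint is ignored here.

#include <stdint.h>
#include <stdlib.h>

/* 16x16 sum of absolute differences (cf. sad16_3dne). The best_sad
 * parameter of the asm version is an early-out hint that this
 * particular implementation never reads, so it is omitted here. */
static uint32_t sad16_ref(const uint8_t *cur, const uint8_t *ref,
                          uint32_t stride)
{
    uint32_t sad = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            sad += abs(cur[y * stride + x] - ref[y * stride + x]);
    return sad;
}

/* Bi-directional SAD (cf. sad16bi_3dne): the two references are
 * averaged with rounding -- exactly what pavgb does -- before the
 * difference against the current block is taken. */
static uint32_t sad16bi_ref(const uint8_t *cur, const uint8_t *ref1,
                            const uint8_t *ref2, uint32_t stride)
{
    uint32_t sad = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++) {
            int avg = (ref1[y * stride + x] + ref2[y * stride + x] + 1) >> 1;
            sad += abs(cur[y * stride + x] - avg);
        }
    return sad;
}

/* Block deviation (cf. dev16_3dne): sum of absolute differences from
 * the truncated block mean. The asm gets this mean almost for free:
 * the 16-bit psadbw pixel sum is at most 256*255, so its high byte
 * (extracted via punpcklbw/pshufw) is sum >> 8 == sum / 256. */
static uint32_t dev16_ref(const uint8_t *cur, uint32_t stride)
{
    uint32_t sum = 0, dev = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            sum += cur[y * stride + x];
    uint8_t mean = (uint8_t)(sum / 256);
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            dev += abs(cur[y * stride + x] - mean);
    return dev;
}

The psadbw instruction computes eight of these byte-wise absolute differences and their sum in a single operation, so each 16-pixel row costs only two psadbw ops. What makes this the "3dne" variant is not the instruction set (it is plain iSSE/MMX) but the K7-tuned scheduling: interleaved loads and psadbw ops, 3*stride precomputed in a spare register, and what appears to be a mov esi, esi alignment filler. The cglobal macro and the PREFIX/FORMAT_COFF conditionals select symbol decoration for the target object format via NASM -D defines.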
