📄 sad_mmx.asm
字号:
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - K7 optimized SAD operators -
; *
; *  Copyright(C) 2001 Peter Ross <pross@xvid.org>
; *               2002 Pascal Massimino <skal@planet-d.net>
; *               2004 Andre Werthmann <wertmann@aei.mpg.de>
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
; *
; * $Id: sad_mmx.asm,v 1.1 2005/01/05 23:02:15 edgomez Exp $
; *
; ***************************************************************************/

;=============================================================================
; File overview
;
; Syntax : NASM (Intel operand order), 64-bit mode.
; ABI    : System V AMD64 -- arguments arrive in rdi, rsi, rdx; result in eax.
;
; Despite the historical title above, the routines in this file compute the
; sum of squared differences ("SSE") over an 8x8 block using MMX
; instructions, for 16-bit and for 8-bit samples.
;=============================================================================

BITS 64

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;   PREFIX     : the exported symbol gets a leading underscore, and every
;                later use of <name> in this file is rewritten to _<name>.
;   MARK_FUNCS : the symbol is tagged as a function whose size is the
;                distance from its entry label to its local `.endfunc` label.
;-----------------------------------------------------------------------------
%macro cglobal 1
    %ifdef PREFIX
        %ifdef MARK_FUNCS
            ; The size expression must use the prefixed name, because the
            ; entry label (and therefore its local .endfunc) is renamed by
            ; the %define below.
            global _%1:function _%1.endfunc-_%1
            ; Only the symbol name is substituted; the ":function ..."
            ; annotation belongs to the global directive alone and must not
            ; be pasted into the label definition.
            %define %1 _%1
        %else
            global _%1
            %define %1 _%1
        %endif
    %else
        %ifdef MARK_FUNCS
            global %1:function %1.endfunc-%1
        %else
            global %1
        %endif
    %endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

;=============================================================================
; Constants
;=============================================================================

%define SSE8_ROWS 8                     ; number of rows processed per block

;=============================================================================
; Code
;=============================================================================

SECTION .text align=16

cglobal sse8_16bit_x86_64
cglobal sse8_8bit_x86_64

;-----------------------------------------------------------------------------
; ROW_SSE_16Bit_MMX <ptrA>, <ptrB>
;
; Accumulates into mm2 (as two dword partial sums) the squared differences
; of one row of eight 16-bit samples: 16 bytes are read from each pointer.
; Differences use wrapping 16-bit subtraction (psubw).
;
; Clobbers: mm0, mm1.   Updates: mm2.
;-----------------------------------------------------------------------------
%macro ROW_SSE_16Bit_MMX 2
        movq    mm0, [%1]               ; samples 0..3 of row A
        movq    mm1, [%1+8]             ; samples 4..7 of row A
        psubw   mm0, [%2]               ; A - B, samples 0..3
        psubw   mm1, [%2+8]             ; A - B, samples 4..7
        pmaddwd mm0, mm0                ; square and add adjacent pairs
        pmaddwd mm1, mm1
        paddd   mm2, mm0                ; accumulate
        paddd   mm2, mm1
%endmacro

;-----------------------------------------------------------------------------
; uint32_t sse8_16bit_x86_64(const int16_t *b1,
;                            const int16_t *b2,
;                            const uint32_t stride);
;
; Sum of squared differences between two blocks of 8 rows x 8 int16 samples.
;
; ABI:   System V AMD64
; In:    rdi = b1
;        rsi = b2
;        edx = stride, in bytes, added to both pointers after each row
; Out:   eax = sum of squared differences
; Clobb: rdx, rsi, rdi, mm0-mm3 (MMX state is cleared with emms on exit)
;
; Each per-sample difference is computed with wrapping 16-bit arithmetic, so
; callers must supply data whose differences fit in a signed 16-bit value.
;-----------------------------------------------------------------------------
sse8_16bit_x86_64:
        ;; stride is a 32-bit argument: the upper half of rdx is not
        ;; guaranteed to be zero, so zero-extend before using it in addresses.
        mov     edx, edx

        ;; Reset the sse accumulator
        pxor    mm2, mm2

        ;; Let's go
%rep SSE8_ROWS
        ROW_SSE_16Bit_MMX rsi, rdi
        lea     rsi, [rsi+rdx]
        lea     rdi, [rdi+rdx]
%endrep

        ;; Finish adding each dword of the accumulator
        movq    mm3, mm2
        psrlq   mm2, 32
        paddd   mm2, mm3
        movd    eax, mm2

        ;; Leave the FPU in x87 mode, as the ABI requires on return
        emms

        ;; All done
        ret
.endfunc:

;-----------------------------------------------------------------------------
; ROW_SSE_8bit_MMX <ptrA>, <ptrB>
;
; Accumulates into mm6 (as two dword partial sums) the squared differences
; of one row of eight 8-bit samples: 8 bytes are read from each pointer.
; Bytes are widened to 16 bits by interleaving with mm7, which must be zero,
; i.e. the samples are treated as unsigned.
;
; Clobbers: mm0-mm3.   Reads: mm7 (== 0).   Updates: mm6.
;-----------------------------------------------------------------------------
%macro ROW_SSE_8bit_MMX 2
        movq      mm0, [%1]             ; load a row
        movq      mm2, [%2]             ; load a row
        movq      mm1, mm0              ; copy row
        movq      mm3, mm2              ; copy row
        punpcklbw mm0, mm7              ; turn the 4 low elements into 16bit
        punpckhbw mm1, mm7              ; turn the 4 high elements into 16bit
        punpcklbw mm2, mm7              ; turn the 4 low elements into 16bit
        punpckhbw mm3, mm7              ; turn the 4 high elements into 16bit
        psubw     mm0, mm2              ; low part of src-dst
        psubw     mm1, mm3              ; high part of src-dst
        pmaddwd   mm0, mm0              ; compute the square sum
        pmaddwd   mm1, mm1              ; compute the square sum
        paddd     mm6, mm0              ; add to the accumulator
        paddd     mm6, mm1              ; add to the accumulator
%endmacro

;-----------------------------------------------------------------------------
; uint32_t sse8_8bit_x86_64(const uint8_t *b1,
;                           const uint8_t *b2,
;                           const uint32_t stride);
;
; Sum of squared differences between two blocks of 8 rows x 8 bytes.
; The bytes are zero-extended before subtraction (unsigned samples).
;
; ABI:   System V AMD64
; In:    rdi = b1
;        rsi = b2
;        edx = stride, in bytes, added to both pointers after each row
; Out:   eax = sum of squared differences (at most 64 * 255^2 = 4161600)
; Clobb: rdx, rsi, rdi, mm0-mm3, mm6, mm7
;        (MMX state is cleared with emms on exit)
;-----------------------------------------------------------------------------
sse8_8bit_x86_64:
        ;; stride is a 32-bit argument: the upper half of rdx is not
        ;; guaranteed to be zero, so zero-extend before using it in addresses.
        mov     edx, edx

        ;; Reset the sse accumulator
        pxor    mm6, mm6

        ;; Used to interleave 8bit data with 0x00 values
        pxor    mm7, mm7

        ;; Let's go
%rep SSE8_ROWS
        ROW_SSE_8bit_MMX rsi, rdi
        lea     rsi, [rsi+rdx]
        lea     rdi, [rdi+rdx]
%endrep

        ;; Finish adding each dword of the accumulator
        movq    mm7, mm6
        psrlq   mm6, 32
        paddd   mm6, mm7
        movd    eax, mm6

        ;; Leave the FPU in x87 mode, as the ABI requires on return
        emms

        ;; All done
        ret
.endfunc:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -