; File: sad_mmx.asm
;/**************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * mmx/xmm sum of absolute difference
; *
; * This program is an implementation of a part of one or more MPEG-4
; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
; * to use this software module in hardware or software products are
; * advised that its use may infringe existing patents or copyrights, and
; * any such use would be at such party's own risk. The original
; * developer of this software module and his/her company, and subsequent
; * editors and their companies, will have no liability for use of this
; * software or modifications or derivatives thereof.
; *
; * This program is free software; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; *************************************************************************/
;/**************************************************************************
; *
; * History:
; *
; * 17.11.2001 bugfix and small improvement for dev16_xmm,
; * removed terminate early in sad16_xmm
; * 12.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
; *
; *************************************************************************/
bits 32

;---------------------------------------------------------------------------
; cglobal <name>
;
; Declares <name> as a global symbol.  When PREFIX is defined at assembly
; time, the exported symbol gets a leading underscore and <name> is
; redefined to that underscored form, so a later "<name>:" label in this
; file defines the underscored symbol.
;---------------------------------------------------------------------------
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro
section .data

align 16
; Four 16-bit words, each equal to 1.  Used as the pmaddwd multiplier that
; folds the four word lanes of an MMX accumulator into two dword sums.
mmx_one:        dw      1, 1, 1, 1

section .text
;===========================================================================
;
; uint32_t sad16_mmx(const uint8_t * const cur,
;                    const uint8_t * const ref,
;                    const uint32_t stride,
;                    const uint32_t best_sad);
;
; Sum of absolute differences over a block of 16 rows x 16 bytes, plain MMX.
;
; In:    arguments are read from the stack (first three only)
; Out:   eax = SAD
; Uses:  ecx, edx, mm0-mm7, flags; esi and edi are saved and restored
;
; best_sad is never read: early termination is not implemented
; (original note: it slows this routine down).
; NOTE(review): no emms is executed here - presumably the caller is
; responsible for clearing the MMX state; confirm against the callers.
;
;===========================================================================
align 16
cglobal sad16_mmx
sad16_mmx:
        push    esi
        push    edi

        mov     esi, [esp + 8 + 4]      ; first pointer argument
        mov     edi, [esp + 8 + 8]      ; second pointer argument
        mov     ecx, [esp + 8 + 12]     ; stride (bytes between rows)

        pxor    mm7, mm7                ; mm7 = 0, used to widen bytes to words
        pxor    mm6, mm6                ; mm6 = four 16-bit partial sums
        mov     edx, 16                 ; rows remaining

.next_row:
        ; --- bytes 0..7 of the row: |a - b| per byte ---
        movq    mm0, [esi]
        movq    mm1, [edi]
        movq    mm4, mm0
        psubusb mm0, mm1                ; max(a - b, 0)
        psubusb mm1, mm4                ; max(b - a, 0)
        por     mm0, mm1                ; one of the two is zero -> |a - b|

        ; --- bytes 8..15 of the row ---
        movq    mm2, [esi + 8]
        movq    mm3, [edi + 8]
        movq    mm5, mm2
        psubusb mm2, mm3
        psubusb mm3, mm5
        por     mm2, mm3

        ; --- widen both halves to words and fold into four lanes ---
        movq    mm1, mm0
        punpcklbw mm0, mm7
        punpckhbw mm1, mm7
        paddusw mm0, mm1

        movq    mm3, mm2
        punpcklbw mm2, mm7
        punpckhbw mm3, mm7
        paddusw mm2, mm3

        ; Each lane gains at most 4 * 255 per row, i.e. at most 16320 over
        ; 16 rows, so the saturating adds never actually saturate.
        paddusw mm6, mm0
        paddusw mm6, mm2

        add     esi, ecx
        add     edi, ecx
        dec     edx
        jnz     .next_row

        ; --- horizontal sum of the four word lanes ---
        pmaddwd mm6, [mmx_one]          ; two dwords: lane0+lane1, lane2+lane3
        movq    mm7, mm6
        psrlq   mm7, 32
        paddd   mm6, mm7                ; low dword = total
        movd    eax, mm6

        pop     edi
        pop     esi
        ret
;===========================================================================
;
; uint32_t sad16_xmm(const uint8_t * const cur,
;                    const uint8_t * const ref,
;                    const uint32_t stride,
;                    const uint32_t best_sad);
;
; experimental!
;
; Sum of absolute differences over a block of 16 rows x 16 bytes, using
; psadbw on MMX registers.  Fully unrolled (generated with %rep).
;
; In:    arguments are read from the stack (first three only)
; Out:   eax = SAD
; Uses:  ecx, mm0, mm2, mm6, flags; esi and edi are saved and restored
;
; best_sad is accepted for interface compatibility but never read: there
; is no early termination.
; NOTE(review): no emms is executed here - presumably the caller is
; responsible for clearing the MMX state; confirm against the callers.
;
;===========================================================================
align 16
cglobal sad16_xmm
sad16_xmm:
        push    esi
        push    edi

        mov     esi, [esp + 8 + 4]      ; first pointer argument
        mov     edi, [esp + 8 + 8]      ; second pointer argument
        mov     ecx, [esp + 8 + 12]     ; stride (bytes between rows)

        ; mm6 accumulates in its low word.  The total is at most
        ; 256 * 255 = 65280, so the saturating word adds never saturate.
        pxor    mm6, mm6

        ; rows 0..14: accumulate, then step both pointers to the next row
%rep 15
        movq    mm0, [esi]
        movq    mm2, [esi + 8]
        psadbw  mm0, [edi]              ; SAD of bytes 0..7
        psadbw  mm2, [edi + 8]          ; SAD of bytes 8..15
        paddusw mm6, mm0
        paddusw mm6, mm2
        add     esi, ecx
        add     edi, ecx
%endrep

        ; row 15: last row, no pointer advance needed
        movq    mm0, [esi]
        movq    mm2, [esi + 8]
        psadbw  mm0, [edi]
        psadbw  mm2, [edi + 8]
        paddusw mm6, mm0
        paddusw mm6, mm2

        movd    eax, mm6                ; upper bits of the low dword are zero

        pop     edi
        pop     esi
        ret
;===========================================================================
;
; uint32_t sad16_sse2(const uint8_t * const cur,
;                     const uint8_t * const ref,
;                     const uint32_t stride,
;                     const uint32_t best_sad);
;
; Sum of absolute differences over a block of 16 rows x 16 bytes, using
; psadbw on XMM registers.  Rows are loaded with movdqu, so neither
; pointer has to be 16-byte aligned.  Fully unrolled (generated with %rep).
;
; In:    arguments are read from the stack (first three only)
; Out:   eax = SAD
; Uses:  ecx, xmm0, xmm1, xmm2, flags; esi and edi are saved and restored
;
; best_sad is accepted for interface compatibility but never read: there
; is no early termination.
;
;===========================================================================
align 16
cglobal sad16_sse2
sad16_sse2:
        push    esi
        push    edi

        mov     esi, [esp + 8 + 4]      ; first pointer argument
        mov     edi, [esp + 8 + 8]      ; second pointer argument
        mov     ecx, [esp + 8 + 12]     ; stride (bytes between rows)

        ; xmm2 holds two partial sums, one in the low word of each 64-bit
        ; half.  Each is at most 16 * 8 * 255 = 32640, so the saturating
        ; word adds never saturate.
        pxor    xmm2, xmm2

        ; rows 0..14: accumulate, then step both pointers to the next row
%rep 15
        movdqu  xmm0, [esi]
        movdqu  xmm1, [edi]
        psadbw  xmm0, xmm1              ; per-half SAD of the 16-byte row
        paddusw xmm2, xmm0
        add     esi, ecx
        add     edi, ecx
%endrep

        ; row 15: last row, no pointer advance needed
        movdqu  xmm0, [esi]
        movdqu  xmm1, [edi]
        psadbw  xmm0, xmm1
        paddusw xmm2, xmm0

        ; add the low-half and high-half partial sums
        movd    eax, xmm2
        psrldq  xmm2, 8
        movd    ecx, xmm2
        add     eax, ecx

        pop     edi
        pop     esi
        ret
;===========================================================================
;
; uint32_t sad8_mmx(const uint8_t * const cur,
;                   const uint8_t * const ref,
;                   const uint32_t stride);
;
; Sum of absolute differences over a block of 8 rows x 8 bytes, plain MMX.
; Two rows are processed per loop iteration.
;
; In:    arguments are read from the stack
; Out:   eax = SAD
; Uses:  ecx, mm0-mm7, flags; esi and edi are saved and restored
;
; NOTE(review): no emms is executed here - presumably the caller is
; responsible for clearing the MMX state; confirm against the callers.
;
;===========================================================================
align 16
cglobal sad8_mmx
sad8_mmx:
        push    esi
        push    edi

        mov     esi, [esp + 8 + 4]      ; first pointer argument
        mov     edi, [esp + 8 + 8]      ; second pointer argument
        mov     ecx, [esp + 8 + 12]     ; stride (bytes between rows)

        pxor    mm7, mm7                ; mm7 = 0, used to widen bytes to words
        pxor    mm6, mm6                ; mm6 = four 16-bit partial sums
        mov     eax, 4                  ; row pairs remaining (4 * 2 = 8 rows)

.next_pair:
        ; --- first row of the pair: |a - b| per byte ---
        movq    mm0, [esi]
        movq    mm1, [edi]
        movq    mm4, mm0
        psubusb mm0, mm1                ; max(a - b, 0)
        psubusb mm1, mm4                ; max(b - a, 0)
        por     mm0, mm1                ; one of the two is zero -> |a - b|

        ; --- second row of the pair (one stride further) ---
        movq    mm2, [esi + ecx]
        movq    mm3, [edi + ecx]
        movq    mm5, mm2
        psubusb mm2, mm3
        psubusb mm3, mm5
        por     mm2, mm3

        ; --- widen both rows to words and fold into four lanes ---
        movq    mm1, mm0
        punpcklbw mm0, mm7
        punpckhbw mm1, mm7
        paddusw mm0, mm1

        movq    mm3, mm2
        punpcklbw mm2, mm7
        punpckhbw mm3, mm7
        paddusw mm2, mm3

        ; Each lane gains at most 4 * 255 per pair, i.e. at most 4080 over
        ; 4 pairs, so the saturating adds never actually saturate.
        paddusw mm6, mm0
        paddusw mm6, mm2

        lea     esi, [esi + ecx*2]      ; advance both pointers by two rows
        lea     edi, [edi + ecx*2]
        dec     eax
        jnz     .next_pair

        ; --- horizontal sum of the four word lanes ---
        pmaddwd mm6, [mmx_one]          ; two dwords: lane0+lane1, lane2+lane3
        movq    mm7, mm6
        psrlq   mm7, 32
        paddd   mm6, mm7                ; low dword = total
        movd    eax, mm6

        pop     edi
        pop     esi
        ret
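; ---------------------------------------------------------------------------
; Web code-viewer residue (not part of the original source), kept as a comment
; Keyboard shortcuts:
;   Copy code                 Ctrl + C
;   Search code               Ctrl + F
;   Full-screen mode          F11
;   Toggle theme              Ctrl + Shift + D
;   Show keyboard shortcuts   ?
;   Increase font size        Ctrl + =
;   Decrease font size        Ctrl + -
; ---------------------------------------------------------------------------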