📄 sad_mmx.asm
字号:
;===========================================================================
;
; uint32_t sad8_xmm(const uint8_t * const cur,
;                   const uint8_t * const ref,
;                   const uint32_t stride);
;
; experimental!
;
; Sum of absolute differences (SAD) between two 8x8 blocks of bytes.
;
; In:    three dword arguments read from the stack in prototype order:
;        two block pointers and one row stride (in bytes) used for both.
;        |a - b| == |b - a|, so which pointer is "cur" and which is "ref"
;        has no influence on the result.
; Out:   eax = sum over 8 rows x 8 bytes of |a[i] - b[i]|
; Saves: esi, edi (pushed and popped here)
; Clobb: eax, ecx, edx, mm0, mm2, mm6, flags
; Note:  relies on PSADBW (SSE / extended MMX, not baseline MMX).
;        No EMMS is executed here, so the MMX state is still live on
;        return -- presumably the caller is responsible (TODO confirm).
;
;===========================================================================
align 16
cglobal sad8_xmm
sad8_xmm:
        push    esi
        push    edi

        ; Two registers were just pushed, hence the extra 8 in every offset.
        mov     esi, [esp + 8 + 4]      ; esi = first block pointer
        mov     edi, [esp + 8 + 8]      ; edi = second block pointer
        mov     ecx, [esp + 8 + 12]     ; ecx = stride
        mov     edx, ecx
        shl     edx, 1                  ; edx = 2 * stride (one row pair)

        pxor    mm6, mm6                ; mm6 = running SAD = 0

        ; Row pairs 0..2: add the SAD of two rows, then step both pointers
        ; down by two rows. Kept fully unrolled (no loop counter).
%rep 3
        movq    mm0, [esi]              ; row n   of the first block
        movq    mm2, [esi + ecx]        ; row n+1 of the first block
        psadbw  mm0, [edi]              ; mm0 = SAD of row n   (low word)
        psadbw  mm2, [edi + ecx]        ; mm2 = SAD of row n+1 (low word)
        paddusw mm6, mm0
        paddusw mm6, mm2                ; running SAD += both rows
        add     esi, edx
        add     edi, edx
%endrep

        ; Row pair 3: the last one, so the pointers are not advanced again.
        movq    mm0, [esi]
        movq    mm2, [esi + ecx]
        psadbw  mm0, [edi]
        psadbw  mm2, [edi + ecx]
        paddusw mm6, mm0
        paddusw mm6, mm2

        movd    eax, mm6                ; return value = low dword of the sum
        pop     edi
        pop     esi
        ret
;===========================================================================
;
; uint32_t dev16_mmx(const uint8_t * const cur,
;                    const uint32_t stride);
;
; Sum of absolute deviations from the mean of a 16x16 block of bytes:
;     mean = (sum of all 256 bytes) >> 8
;     eax  = sum of |cur[i] - mean|
;
; In:    two dword arguments read from the stack: block pointer and row
;        stride in bytes.
; Out:   eax = deviation sum
; Saves: esi, edi (pushed and popped here)
; Clobb: eax, ecx, mm0-mm7, flags
; Note:  reads the constant mmx_one, which is defined outside this routine.
;        No EMMS is executed here, so the MMX state is still live on
;        return -- presumably the caller is responsible (TODO confirm).
;
;===========================================================================

; DEV16_MMX_ABSDIFF reg
;   reg = |reg - mm4| on four unsigned words, with mm5 as scratch.
;   At most one of the two saturating differences is non-zero, so OR-ing
;   them gives the absolute difference.
%macro DEV16_MMX_ABSDIFF 1
        movq    mm5, mm4
        psubusw mm5, %1                 ; mm5 = max(mean - x, 0)
        psubusw %1, mm4                 ; x   = max(x - mean, 0)
        por     %1, mm5                 ; x   = |x - mean|
%endmacro

align 16
cglobal dev16_mmx
dev16_mmx:
        push    esi
        push    edi

        pxor    mm4, mm4                ; mm4 = word sums of bytes 0..7  of each row
        pxor    mm5, mm5                ; mm5 = word sums of bytes 8..15 of each row

        ; Two registers were just pushed, hence the extra 8 in every offset.
        mov     esi, [esp + 8 + 4]      ; esi = cur, walked by the first pass
        mov     ecx, [esp + 8 + 8]      ; ecx = stride
        mov     edi, esi                ; edi = cur, kept for the second pass

        mov     eax, 16                 ; 16 rows to go
        pxor    mm7, mm7                ; mm7 = 0, widens bytes to words below

        ; ---- pass 1: sum all 256 bytes -----------------------------------
.sum_rows:
        movq    mm0, [esi]              ; bytes 0..7
        movq    mm2, [esi + 8]          ; bytes 8..15
        movq    mm1, mm0
        movq    mm3, mm2
        punpcklbw mm0, mm7              ; mm0 = bytes 0..3   as words
        punpcklbw mm2, mm7              ; mm2 = bytes 8..11  as words
        punpckhbw mm1, mm7              ; mm1 = bytes 4..7   as words
        punpckhbw mm3, mm7              ; mm3 = bytes 12..15 as words
        paddw   mm0, mm1
        paddw   mm2, mm3
        paddw   mm4, mm0
        paddw   mm5, mm2
        add     esi, ecx                ; next row
        dec     eax
        jnz     .sum_rows

        ; ---- reduce to the mean and broadcast it to four words -----------
        paddusw mm4, mm5                ; fold both halves into four word sums
        pmaddwd mm4, [mmx_one]          ; adjacent word pairs -> two dwords
                                        ; (assumes mmx_one holds words of 1
                                        ; -- confirm at its definition)
        movq    mm5, mm4
        psrlq   mm5, 32
        paddd   mm4, mm5                ; low dword = total of the block
        psllq   mm4, 32                 ; push total up, discarding the old upper dword
        psrlq   mm4, 32 + 8             ; low dword = total >> 8 = mean, upper = 0
        punpckldq mm4, mm4              ; both dwords = mean
        packssdw mm4, mm4               ; all four words = mean

        pxor    mm6, mm6                ; mm6 = four word accumulators for the deviation
        mov     eax, 16                 ; 16 rows to go

        ; ---- pass 2: sum |byte - mean| ------------------------------------
.dev_rows:
        movq    mm0, [edi]
        movq    mm2, [edi + 8]
        movq    mm1, mm0
        movq    mm3, mm2
        punpcklbw mm0, mm7
        punpcklbw mm2, mm7
        punpckhbw mm1, mm7              ; mm0/mm1 = bytes 0..7  as words
        punpckhbw mm3, mm7              ; mm2/mm3 = bytes 8..15 as words

        DEV16_MMX_ABSDIFF mm0
        DEV16_MMX_ABSDIFF mm1
        DEV16_MMX_ABSDIFF mm2
        DEV16_MMX_ABSDIFF mm3

        paddw   mm0, mm1
        paddw   mm2, mm3
        paddw   mm6, mm0
        paddw   mm6, mm2                ; dev += all 16 absolute differences of this row
        add     edi, ecx                ; next row
        dec     eax
        jnz     .dev_rows

        ; ---- horizontal merge of the four word accumulators ---------------
        pmaddwd mm6, [mmx_one]          ; adjacent word pairs -> two dwords
        movq    mm7, mm6
        psrlq   mm7, 32
        paddd   mm6, mm7                ; low dword = total deviation
        movd    eax, mm6

        pop     edi
        pop     esi
        ret
;===========================================================================
;
; uint32_t dev16_xmm(const uint8_t * const cur,
;                    const uint32_t stride);
;
; experimental!
;
; Sum of absolute deviations from the mean of a 16x16 block of bytes:
;     mean = (sum of all 256 bytes) >> 8
;     eax  = sum of |cur[i] - mean|
;
; In:    two dword arguments read from the stack: block pointer and row
;        stride in bytes.
; Out:   eax = deviation sum
; Saves: esi, edi (pushed and popped here)
; Clobb: ecx, eax, mm0, mm2, mm4, mm5, mm6, mm7, flags
; Note:  relies on PSADBW (SSE / extended MMX, not baseline MMX).
;        No EMMS is executed here, so the MMX state is still live on
;        return -- presumably the caller is responsible (TODO confirm).
;
;===========================================================================
align 16
cglobal dev16_xmm
dev16_xmm:
        push    esi
        push    edi

        pxor    mm4, mm4                ; mm4 = running byte sum = 0

        ; Two registers were just pushed, hence the extra 8 in every offset.
        mov     esi, [esp + 8 + 4]      ; esi = cur, walked by the first pass
        mov     ecx, [esp + 8 + 8]      ; ecx = stride
        mov     edi, esi                ; edi = cur, kept for the second pass

        pxor    mm7, mm7                ; mm7 = 0; PSADBW against 0 sums the bytes

        ; ---- pass 1: sum all 256 bytes (rows 0..14, fully unrolled) -------
        ; Each PSADBW yields at most 8 * 255 in the low word with the upper
        ; bits zero; 32 of them stay below 65536, so PADDW cannot wrap.
%rep 15
        movq    mm0, [esi]              ; bytes 0..7
        movq    mm2, [esi + 8]          ; bytes 8..15
        psadbw  mm0, mm7                ; mm0 = sum of bytes 0..7
        psadbw  mm2, mm7                ; mm2 = sum of bytes 8..15
        paddw   mm4, mm0
        paddw   mm4, mm2                ; sum += both halves of the row
        add     esi, ecx                ; next row
%endrep

        ; Row 15: the last one, so the pointer is not advanced again.
        movq    mm0, [esi]
        movq    mm2, [esi + 8]
        psadbw  mm0, mm7
        psadbw  mm2, mm7
        paddw   mm4, mm0
        paddw   mm4, mm2

        ; ---- mean = sum >> 8, broadcast to all eight bytes of mm4 ---------
        movq    mm5, mm4
        psllq   mm5, 32
        paddd   mm4, mm5                ; both dwords = sum
        psrld   mm4, 8                  ; both dwords = mean
        packssdw mm4, mm4               ; all four words  = mean
        packuswb mm4, mm4               ; all eight bytes = mean

        pxor    mm6, mm6                ; mm6 = running deviation = 0

        ; ---- pass 2: sum |byte - mean| (rows 0..14, fully unrolled) -------
%rep 15
        movq    mm0, [edi]
        movq    mm2, [edi + 8]
        psadbw  mm0, mm4                ; mm0 = sum |bytes 0..7  - mean|
        psadbw  mm2, mm4                ; mm2 = sum |bytes 8..15 - mean|
        paddw   mm6, mm0
        paddw   mm6, mm2                ; dev += both halves of the row
        add     edi, ecx                ; next row
%endrep

        ; Row 15: the last one, so the pointer is not advanced again.
        movq    mm0, [edi]
        movq    mm2, [edi + 8]
        psadbw  mm0, mm4
        psadbw  mm2, mm4
        paddw   mm6, mm0
        paddw   mm6, mm2

        ; PSADBW results are scalars in the low word with the upper bits
        ; zero, so the total is already in the low dword of mm6 and no
        ; horizontal merge is needed before reading it out.
        movd    eax, mm6

        pop     edi
        pop     esi
        ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -