📄 sad_mmx.asm
字号:
;===========================================================================
;
; uint32_t sad8_xmm(const uint8_t * const cur,
;                   const uint8_t * const ref,
;                   const uint32_t stride);
;
; experimental!
;
; Sum of absolute differences between two 8x8 byte blocks that share the
; same stride.  The 8 rows are processed two at a time, fully unrolled.
;
; In:   [esp+4]  = cur, [esp+8] = ref, [esp+12] = stride (bytes per row)
; Out:  eax      = SAD
; Uses: ecx, edx, mm0, mm2, mm6 (esi/edi are saved and restored)
;
; psadbw on MMX registers needs the integer-SSE ("xmm") extension.
; NOTE(review): no emms is executed here - presumably the caller is
; responsible for it; confirm against the C side.
;
;===========================================================================

; Accumulate the SAD of two consecutive rows ([esi]/[edi] and the rows
; one stride below) into mm6.  psadbw leaves its result in the low word
; of the destination and clears the remaining bits.
%macro SAD8_XMM_ROWPAIR 0
        movq    mm0, [esi]          ; row n   of the first block
        movq    mm2, [esi+ecx]      ; row n+1 of the first block
        psadbw  mm0, [edi]          ; mm0 = SAD(row n)
        psadbw  mm2, [edi+ecx]      ; mm2 = SAD(row n+1)
        paddusw mm6, mm0            ; sum += mm0
        paddusw mm6, mm2            ; sum += mm2
%endmacro

align 16
cglobal sad8_xmm
sad8_xmm

        push    esi
        push    edi

        ; SAD is symmetric, so which block ends up in esi or edi
        ; does not affect the result.
        mov     esi, [esp + 8 + 4]  ; 1st argument (cur)
        mov     edi, [esp + 8 + 8]  ; 2nd argument (ref)
        mov     ecx, [esp + 8 + 12] ; stride
        mov     edx, ecx
        shl     edx, 1              ; edx = 2 * stride (one row pair)

        pxor    mm6, mm6            ; mm6 = sum = 0

        ; 4 row pairs = 8 rows; the pointers are advanced only between
        ; pairs, never after the last one.
        SAD8_XMM_ROWPAIR
%rep 3
        add     esi, edx
        add     edi, edx
        SAD8_XMM_ROWPAIR
%endrep

        movd    eax, mm6            ; return sum

        pop     edi
        pop     esi

        ret


;===========================================================================
;
; uint32_t dev16_mmx(const uint8_t * const cur,
;                    const uint32_t stride);
;
; Deviation of a 16x16 byte block from its mean:
;   mean = (sum of all 256 bytes) >> 8
;   dev  = sum over all 256 bytes of |byte - mean|
;
; In:   [esp+4]  = cur, [esp+8] = stride (bytes per row)
; Out:  eax      = dev
; Uses: ecx, mm0-mm7 (esi/edi are saved and restored)
; Reads the quadword constant [mmx_one], defined elsewhere in this file
; (used with pmaddwd to add adjacent words into dwords).
;
; NOTE(review): no emms is executed here - presumably the caller is
; responsible for it; confirm against the C side.
;
;===========================================================================

; %1 = |%1 - mm4| for each of the four unsigned words.
; One of the two saturating differences is always zero, so OR-ing them
; yields the absolute difference.  Clobbers mm5.
%macro DEV16_MMX_ABSDIFF 1
        movq    mm5, mm4
        psubusw mm5, %1             ; mm5 = max(mean - x, 0)
        psubusw %1, mm4             ; x   = max(x - mean, 0)
        por     %1, mm5
%endmacro

align 16
cglobal dev16_mmx
dev16_mmx

        push    esi
        push    edi

        pxor    mm4, mm4            ; mm4 = word sums, left 8 columns
        pxor    mm5, mm5            ; mm5 = word sums, right 8 columns

        mov     esi, [esp + 8 + 4]  ; cur (pass 1 pointer)
        mov     ecx, [esp + 8 + 8]  ; stride
        mov     edi, esi            ; cur (pass 2 pointer)

        mov     eax, 16             ; row counter
        pxor    mm7, mm7            ; mm7 = 0, used to widen bytes to words

        ; ---- pass 1: sum of all pixels ----
.loop1
        movq    mm0, [esi]          ; pixels 0..7
        movq    mm2, [esi + 8]      ; pixels 8..15

        movq    mm1, mm0
        movq    mm3, mm2

        punpcklbw mm0, mm7          ; zero-extend bytes to words
        punpcklbw mm2, mm7

        punpckhbw mm1, mm7
        punpckhbw mm3, mm7

        paddw   mm0, mm1
        paddw   mm2, mm3

        paddw   mm4, mm0
        paddw   mm5, mm2

        add     esi, ecx

        dec     eax
        jnz     .loop1

        paddusw mm4, mm5
        pmaddwd mm4, [mmx_one]      ; merge sum: 4 words -> 2 dwords
        movq    mm5, mm4
        psrlq   mm5, 32
        paddd   mm4, mm5            ; low dword = total sum

        psllq   mm4, 32             ; move the sum up, dropping the old high dword
        psrlq   mm4, 32 + 8         ; back down and /= (16*16)

        punpckldq mm4, mm4
        packssdw mm4, mm4           ; mm4 = mean in all four words

        pxor    mm6, mm6            ; mm6 = dev = 0
        mov     eax, 16             ; row counter

        ; ---- pass 2: sum of |pixel - mean| ----
.loop2
        movq    mm0, [edi]
        movq    mm2, [edi + 8]

        movq    mm1, mm0
        movq    mm3, mm2

        punpcklbw mm0, mm7
        punpcklbw mm2, mm7
        punpckhbw mm1, mm7          ; mm0/mm1 = pixels 0..7  as words
        punpckhbw mm3, mm7          ; mm2/mm3 = pixels 8..15 as words

        DEV16_MMX_ABSDIFF mm0
        DEV16_MMX_ABSDIFF mm1       ; mm0/mm1 = |pixels 0..7  - mean|
        DEV16_MMX_ABSDIFF mm2
        DEV16_MMX_ABSDIFF mm3       ; mm2/mm3 = |pixels 8..15 - mean|

        paddw   mm0, mm1
        paddw   mm2, mm3

        paddw   mm6, mm0
        paddw   mm6, mm2            ; dev += all four partial results

        add     edi, ecx

        dec     eax
        jnz     .loop2

        pmaddwd mm6, [mmx_one]      ; merge dev: 4 words -> 2 dwords
        movq    mm7, mm6
        psrlq   mm7, 32
        paddd   mm6, mm7            ; low dword = total dev

        movd    eax, mm6            ; return dev

        pop     edi
        pop     esi

        ret


;===========================================================================
;
; uint32_t dev16_xmm(const uint8_t * const cur,
;                    const uint32_t stride);
;
; experimental!
;
; Same contract as dev16_mmx (deviation of a 16x16 byte block from its
; mean, mean = sum >> 8), implemented with psadbw and fully unrolled.
;
; In:   [esp+4]  = cur, [esp+8] = stride (bytes per row)
; Out:  eax      = dev
; Uses: ecx, mm0, mm2, mm4-mm7 (esi/edi are saved and restored)
;
; psadbw on MMX registers needs the integer-SSE ("xmm") extension.
; NOTE(review): no emms is executed here - presumably the caller is
; responsible for it; confirm against the C side.
;
;===========================================================================

; Add the 16 pixels of the row at [esi] to the running sum in mm4.
; psadbw against zero (mm7) yields the plain sum of 8 unsigned bytes.
%macro DEV16_XMM_SUM_ROW 0
        movq    mm0, [esi]
        movq    mm2, [esi + 8]
        psadbw  mm0, mm7            ; mm0 = cur0 + cur1 + ... + cur7
        psadbw  mm2, mm7            ; mm2 = cur8 + cur9 + ... + cur15
        paddw   mm4, mm0            ; sum += mm0
        paddw   mm4, mm2            ; sum += mm2
%endmacro

; Add sum(|pixel - mean|) of the row at [edi] to mm6.
; mm4 must hold the mean replicated into all eight bytes.
%macro DEV16_XMM_DEV_ROW 0
        movq    mm0, [edi]
        movq    mm2, [edi + 8]
        psadbw  mm0, mm4            ; mm0 = sum |cur[0..7]  - mean|
        psadbw  mm2, mm4            ; mm2 = sum |cur[8..15] - mean|
        paddw   mm6, mm0            ; dev += mm0
        paddw   mm6, mm2            ; dev += mm2
%endmacro

align 16
cglobal dev16_xmm
dev16_xmm

        push    esi
        push    edi

        pxor    mm4, mm4            ; mm4 = sum = 0

        mov     esi, [esp + 8 + 4]  ; cur (pass 1 pointer)
        mov     ecx, [esp + 8 + 8]  ; stride
        mov     edi, esi            ; cur (pass 2 pointer)

        pxor    mm7, mm7            ; mm7 = 0

        ; ---- pass 1: sum of all pixels, 16 rows ----
        ; The pointer is advanced only between rows, never after the last.
        DEV16_XMM_SUM_ROW
%rep 15
        add     esi, ecx
        DEV16_XMM_SUM_ROW
%endrep

        ; The sum sits in the low word of mm4 and all other bits are zero.
        ; Broadcast (sum >> 8) into every byte of mm4.
        movq    mm5, mm4
        psllq   mm5, 32
        paddd   mm4, mm5            ; both dwords = sum
        psrld   mm4, 8              ; both dwords = mean = sum / (16*16)
        packssdw mm4, mm4           ; four words  = mean
        packuswb mm4, mm4           ; eight bytes = mean

        pxor    mm6, mm6            ; mm6 = dev = 0

        ; ---- pass 2: sum of |pixel - mean|, 16 rows ----
        DEV16_XMM_DEV_ROW
%rep 15
        add     edi, ecx
        DEV16_XMM_DEV_ROW
%endrep

        ; psadbw only ever produces a value in the low word, so the total
        ; is already in the low dword of mm6 and needs no horizontal merge.
        movd    eax, mm6            ; return dev

        pop     edi
        pop     esi

        ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -