📄 mefunctions_mmx.asm
字号:
paddw xmm7, xmm3 ; add result of 8th row
movdqa xmm0, [edi] ; load 9th row
movdqa xmm1, [edi+eax] ; load 10th row
psadbw xmm0, xmm5
psadbw xmm1, xmm5
movdqa xmm2, [edi+eax*2] ; load 11th row
movdqa xmm3, [edi+ecx] ; load 12th row
psadbw xmm2, xmm5
psadbw xmm3, xmm5
paddw xmm6, xmm0
paddw xmm7, xmm1
add edi, eax
paddw xmm6, xmm2
add edi, ecx ; ready to load next four rows !!!, from 13th row
paddw xmm7, xmm3
movdqa xmm0, [edi]
movdqa xmm1, [edi+eax]
psadbw xmm0, xmm5
psadbw xmm1, xmm5
movdqa xmm2, [edi+eax*2]
movdqa xmm3, [edi+ecx]
psadbw xmm2, xmm5
psadbw xmm3, xmm5
paddw xmm6, xmm0
paddw xmm7, xmm1
paddw xmm6, xmm2
paddw xmm7, xmm3
;------------------------------------------
pxor xmm0, xmm0
paddw xmm7, xmm6
punpcklbw xmm5, xmm0
movhlps xmm6, xmm7
punpcklwd xmm5, xmm0
paddd xmm7, xmm6
movd [esi], xmm5
movd eax, xmm7
pop esi
pop edi
ret
;===========================================================================
;
; uint32_t dev16_mv0_mmx(const uint8_t * const cur,const uint8_t * const ref,
; const uint32_t stride);
;
; Works on two 16x16 byte blocks (cur and ref) that share one row stride:
;   pass 1  accumulates sum(cur) as 16-bit lane sums,
;   pass 2  subtracts sum(ref) from the same lanes,
;   then    the lanes are merged, biased with mmx_65280, shifted right by 8
;           and broadcast to four words (the "mean"),
;   pass 3  accumulates |(cur - ref + mmx_255) - mean| over all 256 positions.
; The accumulated value is returned in eax.
;
; Calling convention: 32-bit, arguments on the stack (cur, ref, stride).
; esi, edi, ebx and edx are saved and restored.
; eax, ecx, flags and mm0-mm7 are overwritten; no emms is executed here.
;
; NOTE(review): mmx_one, mmx_65280 and mmx_255 are defined outside this
; routine. Presumably mmx_one is four words of 1 and the two bias constants
; are chosen so that the mean carries the same +255 offset as the per-pixel
; differences -- confirm against their definitions.
;
;===========================================================================

; Leaves the word sums of one 16-byte row in mm0 (bytes 0..7) and mm2
; (bytes 8..15): each lane holds byte[k] + byte[k+4] of its 8-byte half.
; %1 = register pointing at the row.  Expects mm7 = 0.  Uses mm1, mm3.
%macro DEV16_MV0_ROW_WORDSUMS 1
        movq        mm0, [%1]
        movq        mm2, [%1 + 8]
        movq        mm1, mm0
        movq        mm3, mm2
        punpcklbw   mm0, mm7
        punpcklbw   mm2, mm7
        punpckhbw   mm1, mm7
        punpckhbw   mm3, mm7
        paddw       mm0, mm1
        paddw       mm2, mm3
%endmacro

; Adds |(cur - ref + mm7) - mm4| for 8 horizontally adjacent pixels to the
; word accumulator mm6.
; %1 = byte offset inside the current row.
; Expects esi = cur row, edi = ref row, mm4 = mean (4 words), mm7 = bias.
; Uses mm0-mm3 and mm5.
%macro DEV16_MV0_ABSDEV_8PX 1
        movq        mm0, [esi + %1]
        pxor        mm5, mm5                ; zero for the byte->word unpacks
        movq        mm1, mm0
        punpcklbw   mm0, mm5
        punpckhbw   mm1, mm5                ; mm0:mm1 = cur as words
        movq        mm2, [edi + %1]
        movq        mm3, mm2
        punpcklbw   mm2, mm5
        punpckhbw   mm3, mm5                ; mm2:mm3 = ref as words
        psubw       mm0, mm2
        psubw       mm1, mm3
        paddw       mm0, mm7                ; bias the differences
        paddw       mm1, mm7
        ; |x - mean| = (mean -sat x) | (x -sat mean): one of the two
        ; unsigned saturating differences is always zero.
        movq        mm5, mm4
        psubusw     mm5, mm0
        psubusw     mm0, mm4
        por         mm0, mm5
        movq        mm5, mm4
        psubusw     mm5, mm1
        psubusw     mm1, mm4
        por         mm1, mm5
        paddw       mm6, mm0
        paddw       mm6, mm1
%endmacro

align 64
global _dev16_mv0_mmx
_dev16_mv0_mmx:
        push        esi
        push        edi
        push        ebx
        push        edx

        mov         esi, [esp + 16 + 4]     ; cur
        mov         edi, [esp + 16 + 8]     ; ref
        mov         ecx, [esp + 16 + 12]    ; stride
        mov         ebx, esi                ; remember both block starts for pass 3
        mov         edx, edi

        pxor        mm7, mm7                ; zero for the unpacks
        pxor        mm4, mm4                ; lane sums, bytes 0..7 of each row
        pxor        mm5, mm5                ; lane sums, bytes 8..15 of each row

        ; ---- pass 1: add every row of cur --------------------------------
        mov         eax, 16
.sum_cur:
        DEV16_MV0_ROW_WORDSUMS esi
        paddw       mm4, mm0
        paddw       mm5, mm2
        add         esi, ecx
        dec         eax
        jnz         short .sum_cur

        ; ---- pass 2: subtract every row of ref ---------------------------
        mov         eax, 16
.sub_ref:
        DEV16_MV0_ROW_WORDSUMS edi
        psubw       mm4, mm0
        psubw       mm5, mm2
        add         edi, ecx
        dec         eax
        jnz         .sub_ref

        ; ---- reduce the lane sums to one value, broadcast as "mean" ------
        paddw       mm4, mm5
        pmaddwd     mm4, [mmx_one]          ; word pairs -> two dwords
        paddd       mm4, [mmx_65280]        ; bias so the dwords are non-negative
        movq        mm5, mm4
        psrlq       mm5, 32
        paddd       mm4, mm5                ; low dword = both halves added
        psllq       mm4, 32                 ; discard the upper dword ...
        psrlq       mm4, 32 + 8             ; ... and divide by 16*16
        punpckldq   mm4, mm4
        packssdw    mm4, mm4                ; same value in all four words

        ; ---- pass 3: accumulate the absolute deviations ------------------
        mov         esi, ebx                ; rewind cur
        mov         edi, edx                ; rewind ref
        movq        mm7, [mmx_255]          ; bias added to every cur - ref
        pxor        mm6, mm6                ; deviation accumulator
        mov         eax, 16
.dev_row:
        DEV16_MV0_ABSDEV_8PX 0
        DEV16_MV0_ABSDEV_8PX 8
        add         edi, ecx
        add         esi, ecx
        dec         eax
        jnz         .dev_row

        ; ---- fold the four word lanes of mm6 into eax --------------------
        pmaddwd     mm6, [mmx_one]
        movq        mm7, mm6
        psrlq       mm7, 32
        paddd       mm6, mm7
        movd        eax, mm6

        pop         edx
        pop         ebx
        pop         edi
        pop         esi
        ret
;===========================================================================
;
; uint32_t mean16_mmx(const uint8_t * const cur,
; const uint32_t stride);
;
; Adds up the 256 bytes of the 16x16 block at cur (rows `stride` bytes
; apart) and returns the sum shifted right by 8, i.e. the truncated mean.
;
; Calling convention: 32-bit, arguments on the stack; result in eax.
; esi is saved and restored.  ecx, flags and mm0-mm6 are overwritten;
; no emms is executed here.
;
; Overflow: a word lane collects at most 2 bytes per row * 16 rows * 255
; = 8160 before the two accumulators are combined, so nothing saturates.
;
; NOTE(review): mmx_one is defined outside this routine; presumably four
; words of 1, so that pmaddwd just adds neighbouring lanes -- confirm.
;
;===========================================================================
align 64
global _mean16_mmx
_mean16_mmx:
        push        esi

        mov         esi, [esp + 4 + 4]      ; cur
        mov         eax, [esp + 4 + 8]      ; stride

        pxor        mm6, mm6                ; zero for the byte->word unpacks
        pxor        mm4, mm4                ; lane sums, bytes 0..7 of each row
        pxor        mm5, mm5                ; lane sums, bytes 8..15 of each row

        mov         ecx, 16                 ; rows still to add
.row_pair:
%rep 2                                      ; two rows per loop iteration
        movq        mm0, [esi]
        movq        mm2, [esi + 8]
        movq        mm1, mm0
        movq        mm3, mm2
        punpcklbw   mm0, mm6
        punpckhbw   mm1, mm6
        punpcklbw   mm2, mm6
        punpckhbw   mm3, mm6
        paddw       mm0, mm1
        paddw       mm2, mm3
        paddw       mm4, mm0
        paddw       mm5, mm2
        add         esi, eax                ; next row
%endrep
        sub         ecx, 2
        jnz         .row_pair

        paddusw     mm4, mm5                ; combine both halves of the rows
        pmaddwd     mm4, [mmx_one]          ; word pairs -> two dwords
        movq        mm5, mm4
        psrlq       mm5, 32
        paddd       mm4, mm5                ; low dword = total sum
        psllq       mm4, 32                 ; discard the upper dword ...
        psrlq       mm4, 32 + 8             ; ... and divide by 16*16
        movd        eax, mm4

        pop         esi
        ret
;===========================================================================
;
; uint32_t mean16_xmm(const uint8_t * const cur,
; const uint32_t stride);
;
; Adds up the 256 bytes of the 16x16 block at cur (rows `stride` bytes
; apart) and returns the sum shifted right by 8, i.e. the truncated mean.
;
; Each row is read as two 8-byte halves; psadbw against an all-zero
; register turns each half into the sum of its bytes.  The first halves
; collect in mm5, the second halves in mm4 (at most 16*8*255 = 32640 each,
; 65280 combined, so the saturating word adds never clip).
;
; Calling convention: 32-bit, arguments on the stack; result in eax.
; esi is saved and restored.  Flags, mm0, mm2 and mm4-mm6 are overwritten;
; no emms is executed here.  Uses psadbw on MMX registers.
;
;===========================================================================
align 64
global _mean16_xmm
_mean16_xmm:
        push        esi

        mov         esi, [esp + 4 + 4]      ; cur
        mov         eax, [esp + 4 + 8]      ; stride

        pxor        mm6, mm6                ; all-zero operand for psadbw
        pxor        mm4, mm4                ; sum of bytes 8..15 of every row
        pxor        mm5, mm5                ; sum of bytes 0..7  of every row

        ; rows 1..14: add the row, then step to the next one
%rep 14
        movq        mm0, [esi]
        movq        mm2, [esi + 8]
        psadbw      mm0, mm6
        psadbw      mm2, mm6
        add         esi, eax
        paddusw     mm5, mm0
        paddusw     mm4, mm2
%endrep

        ; row 15, addressed through esi
        movq        mm0, [esi]
        movq        mm2, [esi + 8]
        psadbw      mm0, mm6
        psadbw      mm2, mm6
        paddusw     mm5, mm0
        paddusw     mm4, mm2

        ; row 16, addressed through esi + stride (esi is not advanced again)
        movq        mm0, [esi + eax]
        movq        mm2, [esi + eax + 8]
        psadbw      mm0, mm6
        psadbw      mm2, mm6
        paddusw     mm5, mm0
        paddusw     mm4, mm2

        paddusw     mm5, mm4                ; total of all 256 bytes
        psllq       mm5, 32                 ; keep only the low dword ...
        psrlq       mm5, 32 + 8             ; ... and divide by 16*16
        movd        eax, mm5

        pop         esi
        ret
;===========================================================================
;
; uint32_t mean16_sse2(const uint8_t * const cur,
; const uint32_t stride, uint32_t * const mean);
;
; Adds up the 256 bytes of the 16x16 block at cur (rows `stride` bytes
; apart) and returns the sum shifted right by 8, i.e. the truncated mean.
;
; NOTE(review): the prototype above lists a third argument, but this
; routine only reads cur and stride from the stack and hands the result
; back in eax; `mean` is never read or written here.
;
; Requirements: every row is loaded with movdqa, so cur must be 16-byte
; aligned and stride must be a multiple of 16.
;
; Calling convention: 32-bit, arguments on the stack; result in eax.
; esi is saved and restored.  xmm0, xmm1 and xmm4-xmm6 are overwritten.
;
; Fixes relative to the previous version:
;   * the row step used ecx as "2*stride" although ecx was never loaded;
;     the pointer is now advanced with lea from eax (stride) directly,
;   * the unused "ecx = 3*stride" computation is gone,
;   * the final reduction no longer depends on the stale upper half of xmm4.
;
;===========================================================================
align 64
global _mean16_sse2
_mean16_sse2:
        push        esi

        mov         esi, [esp + 4 + 4]      ; cur
        mov         eax, [esp + 4 + 8]      ; stride

        pxor        xmm5, xmm5              ; running sums, one per 8-byte half
        pxor        xmm6, xmm6              ; all-zero operand for psadbw

        ; rows 1..14, two rows at a time
%rep 7
        movdqa      xmm0, [esi]
        movdqa      xmm1, [esi + eax]
        psadbw      xmm0, xmm6              ; two byte sums per row
        psadbw      xmm1, xmm6
        lea         esi, [esi + eax*2]      ; step two rows
        paddw       xmm5, xmm0
        paddw       xmm5, xmm1
%endrep

        ; rows 15 and 16 (no pointer step needed afterwards)
        movdqa      xmm0, [esi]
        movdqa      xmm1, [esi + eax]
        psadbw      xmm0, xmm6
        psadbw      xmm1, xmm6
        paddw       xmm5, xmm0
        paddw       xmm5, xmm1

        ; xmm5 holds two partial sums (<= 32640 each) in word 0 of each
        ; qword; every other word is zero.
        movhlps     xmm4, xmm5              ; xmm4 low qword = upper partial sum
        paddw       xmm4, xmm5              ; word 0 = total (<= 65280), word 1 = 0
        movd        eax, xmm4               ; eax = total
        shr         eax, 8                  ; divide by 16*16

        pop         esi
        ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -