; mefunctions_mmx.asm
; (code-viewer page residue: font-size control)
psllq mm5, 32
psrlq mm5, 32 + 8 ; get mean of MB
punpckldq mm5, mm5
mov ecx, 16
packssdw mm5, mm5 ; mm5 = mean; scatter mean into four words
loop4dev:
movq mm2, [esi + 8] ; second half of data in the same row
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm6 ; byte -> word
punpcklbw mm2, mm6
punpckhbw mm1, mm6 ; mm01 = cur
movq mm4, mm5 ; mm45 = mean
punpckhbw mm3, mm6 ; mm23 = cur2
psubusw mm4, mm0
psubusw mm0, mm5
por mm0, mm4 ; mm0 = |mm0-mean|
movq mm4, mm5
paddw mm7, mm0 ; dev += mm0
psubusw mm4, mm1
psubusw mm1, mm5
por mm1, mm4 ; mm1 = |mm1 - mm4|
movq mm4, mm5
paddw mm7, mm1 ; dev += mm1
psubusw mm4, mm2
psubusw mm2, mm5
por mm2, mm4 ; mm2 = |mm2 - mean|
movq mm4, mm5
paddw mm7, mm2 ; dev += mm2
psubusw mm4, mm3
psubusw mm3, mm5
por mm3, mm4 ; mm3 = |mm3 - mean|
add esi, eax
paddw mm7, mm3 ; dev += mm3
movq mm0, [esi]
movq mm2, [esi + 8] ; second half of data in the same row
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm6 ; byte -> word
punpcklbw mm2, mm6
punpckhbw mm1, mm6 ; mm01 = cur
movq mm4, mm5 ; mm45 = mean
punpckhbw mm3, mm6 ; mm23 = cur2
psubusw mm4, mm0
psubusw mm0, mm5
por mm0, mm4 ; mm0 = |mm0-mean|
movq mm4, mm5
paddw mm7, mm0 ; dev += mm0
psubusw mm4, mm1
psubusw mm1, mm5
por mm1, mm4 ; mm1 = |mm1 - mm4|
movq mm4, mm5
paddw mm7, mm1 ; dev += mm1
psubusw mm4, mm2
psubusw mm2, mm5
por mm2, mm4 ; mm2 = |mm2 - mean|
movq mm4, mm5
paddw mm7, mm2 ; dev += mm2
psubusw mm4, mm3
psubusw mm3, mm5
por mm3, mm4 ; mm3 = |mm3 - mean|
add esi, eax
paddw mm7, mm3 ; dev += mm3
sub ecx, 2
movq mm0, [esi] ; load data
jnz loop4dev
pmaddwd mm7, [mmx_one] ; merge dev
mov esi, [esp+8+12]
movq mm0, mm7
punpcklwd mm5, mm6
psrlq mm0, 32
movd [esi], mm5
paddd mm7, mm0
pop esi
movd eax, mm7
pop edi
ret
;===========================================================================
;
; uint32_t dev16_xmm(const uint8_t * const cur,
;                    const uint32_t stride,
;                    uint32_t * const mean);
;
; Works on a block of 16 rows x 16 bytes starting at cur, rows being
; stride bytes apart, in two passes:
;   pass 1: sum = sum of all 256 bytes;  m = sum >> 8
;   pass 2: dev = sum over all 256 bytes of |byte - m|
;
; In:    [esp+4] = cur, [esp+8] = stride, [esp+12] = mean  (stack arguments)
; Out:   eax = dev, dword [mean] = m
; Saved: esi, edi (pushed/popped here)
; Clobb: mm0, mm2, mm4-mm7, flags.  No emms is executed in this routine.
; Note:  psadbw on mm registers is used, so plain MMX is not sufficient.
;
;===========================================================================
align 64
global _dev16_xmm
_dev16_xmm:
        push    edi
        push    esi

        mov     esi, [esp + 8 + 4]      ; esi = cur (8 = the two pushes above)
        mov     eax, [esp + 8 + 8]      ; eax = stride
        pxor    mm6, mm6                ; mm6 = 0; psadbw against 0 sums the bytes
        pxor    mm4, mm4                ; mm4 = running sum of bytes 8..15 of each row
        pxor    mm5, mm5                ; mm5 = running sum of bytes 0..7 of each row
        mov     edi, esi                ; remember cur for the second pass

        ; ---- pass 1: sum of all 256 bytes (fully unrolled, one row per copy)
%rep 16
        movq    mm0, [esi]
        movq    mm2, [esi + 8]
        psadbw  mm0, mm6                ; low word = sum of 8 bytes, rest = 0
        psadbw  mm2, mm6
        add     esi, eax                ; next row
        paddusw mm5, mm0
        paddusw mm4, mm2
%endrep

        ; ---- mean = sum >> 8, then replicate it into all 8 bytes of mm5
        paddusw mm5, mm4                ; total <= 256*255 = 65280, fits one word
        mov     esi, edi                ; rewind to the first row
        psllq   mm5, 32                 ; drop the upper 32 bits
        psrlq   mm5, 32 + 8             ; undo the shift and divide by 256
        punpckldq mm5, mm5              ; mean in both dwords
        packssdw mm5, mm5               ; mean in four words (<= 255, no saturation)
        packuswb mm5, mm5               ; mean in eight bytes, ready for psadbw
        pxor    mm7, mm7                ; mm7 = dev accumulator, bytes 0..7
                                        ; mm6 is still 0: dev accumulator, bytes 8..15

        ; ---- pass 2: sum of |byte - mean| (fully unrolled, one row per copy)
%rep 16
        movq    mm0, [esi]
        movq    mm2, [esi + 8]
        psadbw  mm0, mm5                ; low word = sum |byte - mean| over 8 bytes
        psadbw  mm2, mm5
        add     esi, eax                ; next row
        paddusw mm7, mm0
        paddusw mm6, mm2
%endrep

        ; ---- results
        pxor    mm0, mm0
        mov     edi, [esp + 8 + 12]     ; edi = mean (output pointer)
        ; mm5 holds the mean as BYTES here.  Widen byte -> word first; going
        ; straight to punpcklwd would store mean | (mean << 8) instead of mean.
        punpcklbw mm5, mm0              ; bytes -> words
        punpcklwd mm5, mm0              ; words -> dwords, low dword = mean
        paddusw mm7, mm6                ; dev total <= 65280, fits one word
        movd    [edi], mm5              ; *mean = mean
        pop     esi
        movd    eax, mm7                ; return dev
        pop     edi
        ret
;===========================================================================
;
; uint32_t dev16_sse2(const uint8_t * const cur,
; const uint32_t stride,
; uint32_t * const mean);
;
;===========================================================================
align 64
global _dev16_sse2
_dev16_sse2
push edi
push esi
mov esi, [esp + 8 + 4] ; cur
mov eax, [esp + 8 + 8] ; stride
pxor xmm5, xmm5
mov ecx, eax
pxor xmm6, xmm6
shl ecx, 1 ; ecx = 2*stride
mov edi, esi
movdqa xmm0, [esi] ; 1st row
movdqa xmm2, [esi+eax] ; 2nd row
psadbw xmm0, xmm6
psadbw xmm2, xmm6
add esi, ecx ; start working on 3rd and 4th row
movdqa xmm1, [esi]
paddw xmm5, xmm0
movdqa xmm3, [esi+eax]
paddw xmm5, xmm2
psadbw xmm1, xmm6
psadbw xmm3, xmm6
add esi, ecx ; start working on 5th and 6th row
movdqa xmm0, [esi]
paddw xmm5, xmm1
movdqa xmm2, [esi+eax]
paddw xmm5, xmm3
psadbw xmm0, xmm6
psadbw xmm2, xmm6
add esi, ecx ; start working on 7th and 8th row
movdqa xmm1, [esi]
paddw xmm5, xmm0
movdqa xmm3, [esi+eax]
paddw xmm5, xmm2
psadbw xmm1, xmm6
psadbw xmm3, xmm6
add esi, ecx ; start working on 9th and 10th row
movdqa xmm0, [esi]
paddw xmm5, xmm1
movdqa xmm2, [esi+eax]
paddw xmm5, xmm3
psadbw xmm0, xmm6
psadbw xmm2, xmm6
add esi, ecx ; start working on 11th and 12th row
movdqa xmm1, [esi]
paddw xmm5, xmm0
movdqa xmm3, [esi+eax]
paddw xmm5, xmm2
psadbw xmm1, xmm6
psadbw xmm3, xmm6
add esi, ecx ; start working on 13th and 14th row
movdqa xmm0, [esi]
paddw xmm5, xmm1
movdqa xmm2, [esi+eax]
paddw xmm5, xmm3
psadbw xmm0, xmm6
psadbw xmm2, xmm6
add esi, ecx ; start working on 15th and 16th row
movdqa xmm1, [esi]
paddw xmm5, xmm0
movdqa xmm3, [esi+eax]
paddw xmm5, xmm2
psadbw xmm1, xmm6
psadbw xmm3, xmm6
paddw xmm5, xmm1
add ecx, eax ; now ecx = 3*stride
paddw xmm5, xmm3
pxor xmm7, xmm7 ; xmm7 = dev = 0
movhlps xmm4, xmm5
paddw xmm5, xmm4
movdqa xmm0, [edi]
psrlw xmm5, 8
movdqa xmm1, [edi+eax]
movlhps xmm5, xmm5
movdqa xmm2, [edi+eax*2]
movdqa xmm4, xmm5
movdqa xmm3, [edi+ecx]
psllq xmm4, 32
add edi, eax
paddd xmm5, xmm4
add edi, ecx ; ready to load next four rows !!!, from 5th row
packssdw xmm5, xmm5
mov esi, [esp+8+12] ; ready to load value for mean
packuswb xmm5, xmm5 ; scatter mean over xmm4 in byte mode
psadbw xmm0, xmm5
psadbw xmm1, xmm5
paddw xmm7, xmm0
paddw xmm6, xmm1
psadbw xmm2, xmm5
psadbw xmm3, xmm5
movdqa xmm0, [edi] ; load 5th row
movdqa xmm1, [edi+eax] ; load 6th row
paddw xmm6, xmm2
paddw xmm7, xmm3 ; finish accumulation for last four rows
movdqa xmm2, [edi+eax*2] ; load 7th row
movdqa xmm3, [edi+ecx] ; load 8th row
add edi, eax
psadbw xmm0, xmm5
psadbw xmm1, xmm5
add edi, ecx ; ready to load next four rows !!!, from 9th row
paddw xmm6, xmm0
paddw xmm7, xmm1
psadbw xmm2, xmm5
psadbw xmm3, xmm5
paddw xmm6, xmm2
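; --- code-viewer page residue below (not part of the assembly source) ---
; Keyboard shortcuts
;   Copy code             Ctrl + C
;   Search code           Ctrl + F
;   Full-screen mode      F11
;   Toggle theme          Ctrl + Shift + D
;   Show shortcuts        ?
;   Increase font size    Ctrl + =
;   Decrease font size    Ctrl + -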