; mefunctions_mmx.asm
;/**************************************************************************
; *
; * History:
; *
; * 01.06.2002 imported from XVID MPEG-4 VIDEO CODEC sad_mmx.asm,
; * rewrote and added more functionality. Sigma Designs, Inc.
; *
; * History of original sad_mmx.asm:
; *
; * 17.11.2001 bugfix and small improvement for dev16_xmm,
; * removed terminate early in sad16_xmm
; * 12.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
; *
; *************************************************************************/
bits 32

;---------------------------------------------------------------------------
; Constant tables shared by the routines in this file.
;---------------------------------------------------------------------------
section .data
align 16

; Four 16-bit ones: used as the pmaddwd multiplier that folds four word
; lanes into two dword sums.
mmx_one:        dw      1, 1, 1, 1

; Word mask that clears lane 0 and keeps lanes 1..3 (used by get_cbp_mmx to
; ignore the first coefficient of each block).
mask_dc:        dw      0, -1, -1, -1

; Two dwords holding 128*255 (= 32640).
; NOTE(review): the label says 65280 (= 256*255) but the stored value is
; 32640; no use of this constant is visible here -- confirm which is intended.
mmx_65280:      dd      128*255, 128*255

; Four 16-bit words holding 255.
mmx_255:        dw      255, 255, 255, 255

section .text
;===========================================================================
;
; uint32_t sad16x16_mmx(const uint8_t * const cur,
;                       const uint8_t * const ref,
;                       const uint32_t stride)
;
; Sum of absolute differences between two 16x16 byte blocks, plain MMX.
;
; In  (stack): cur    - top-left byte of the current block
;              ref    - top-left byte of the reference block
;              stride - distance in bytes between consecutive rows
; Out:         eax    - SAD of the 256 byte pairs
; Saved:       esi, edi (pushed/popped here)
; Clobbers:    ecx, mm0-mm7, flags.  No EMMS is executed in this routine.
;
; Every row is loaded at the top of its own processing step, so exactly
; rows 0..15 of both blocks are read and nothing beyond row 15 is touched.
;===========================================================================
align 64
global _sad16x16_mmx
_sad16x16_mmx:
        push    edi
        push    esi

        mov     esi, [esp + 8 + 4]      ; esi = cur
        mov     edi, [esp + 8 + 8]      ; edi = ref
        mov     eax, [esp + 8 + 12]     ; eax = stride

        pxor    mm7, mm7                ; mm7 = four 16-bit partial sums
        pxor    mm6, mm6                ; mm6 = 0, used to widen bytes to words
        mov     ecx, 16                 ; rows remaining

.row_pair:
%rep 2                                  ; two rows per loop pass
        movq    mm1, [esi]              ; cur[0..7]
        movq    mm3, [esi + 8]          ; cur[8..15]
        movq    mm0, [edi]              ; ref[0..7]
        movq    mm2, [edi + 8]          ; ref[8..15]
        add     esi, eax                ; step both pointers to the next row
        add     edi, eax

        movq    mm4, mm1                ; keep copies of cur for the reverse
        movq    mm5, mm3                ;   subtraction
        psubusb mm1, mm0                ; max(cur - ref, 0)
        psubusb mm3, mm2
        psubusb mm0, mm4                ; max(ref - cur, 0)
        psubusb mm2, mm5
        por     mm0, mm1                ; mm0 = |cur - ref|, bytes 0..7
        por     mm2, mm3                ; mm2 = |cur - ref|, bytes 8..15

        movq    mm1, mm0
        movq    mm3, mm2
        punpcklbw mm0, mm6              ; widen the byte differences to words
        punpckhbw mm1, mm6
        punpcklbw mm2, mm6
        punpckhbw mm3, mm6
        paddusw mm0, mm1
        paddusw mm2, mm3
        paddusw mm7, mm0                ; each lane grows by at most 4*255 per
        paddusw mm7, mm2                ;   row: 16 rows -> <= 16320, no overflow
%endrep
        sub     ecx, 2
        jnz     .row_pair

        pmaddwd mm7, [mmx_one]          ; fold 4 word lanes into 2 dwords
        pop     esi
        movq    mm0, mm7
        pop     edi
        psrlq   mm7, 32
        paddd   mm0, mm7                ; low dword = total SAD
        movd    eax, mm0
        ret
;===========================================================================
;
; uint32_t sad16x16_xmm(const uint8_t * const cur,
;                       const uint8_t * const ref,
;                       const uint32_t stride)
;
; Sum of absolute differences between two 16x16 byte blocks, using PSADBW
; on the 64-bit MMX registers.
;
; In  (stack): cur, ref - top-left bytes of the two blocks
;              stride   - distance in bytes between consecutive rows
; Out:         eax      - SAD of the 256 byte pairs
; Saved:       ebx, esi, edi (pushed/popped here)
; Clobbers:    ecx, mm0, mm2, mm3, mm4, mm6, mm7, flags.
;              No EMMS is executed in this routine.
;
; mm6 accumulates the left 8 columns, mm7 the right 8 columns; the two are
; added once after the loop.  The grand total is at most 256*255 = 65280,
; which fits the 16-bit lane that PSADBW writes.
;===========================================================================
align 64
global _sad16x16_xmm
_sad16x16_xmm:
        push    edi
        push    esi
        push    ebx

        mov     esi, [esp + 12 + 4]     ; esi = cur
        mov     edi, [esp + 12 + 8]     ; edi = ref
        mov     eax, [esp + 12 + 12]    ; eax = stride
        lea     ebx, [eax + eax]        ; ebx = 2 * stride

        pxor    mm6, mm6                ; left-half accumulator
        pxor    mm7, mm7                ; right-half accumulator
        mov     ecx, 16                 ; rows remaining

.four_rows:
%rep 2                                  ; each repetition handles two rows
        movq    mm0, [esi]              ; cur row r,   bytes 0..7
        movq    mm2, [esi + 8]          ; cur row r,   bytes 8..15
        movq    mm3, [esi + eax]        ; cur row r+1, bytes 0..7
        movq    mm4, [esi + eax + 8]    ; cur row r+1, bytes 8..15

        psadbw  mm0, [edi]              ; SAD against the matching ref bytes
        psadbw  mm2, [edi + 8]
        psadbw  mm3, [edi + eax]
        psadbw  mm4, [edi + eax + 8]

        add     esi, ebx                ; advance both blocks by two rows
        add     edi, ebx

        paddusw mm6, mm0
        paddusw mm7, mm2
        paddusw mm6, mm3
        paddusw mm7, mm4
%endrep
        sub     ecx, 4
        jnz     .four_rows

        paddusw mm7, mm6                ; left half + right half

        pop     ebx
        pop     esi
        pop     edi
        movd    eax, mm7                ; return the total
        ret
;===========================================================================
;
; uint32_t sad16x16_sse2(const uint8_t * const cur,
;                        const uint8_t * const ref,
;                        const uint32_t stride)
;
; Sum of absolute differences between two 16x16 byte blocks, using 128-bit
; PSADBW (one instruction per 16-pixel row).
;
; In  (stack): cur, ref - top-left bytes of the two blocks
;              stride   - distance in bytes between consecutive rows
; Out:         eax      - SAD of the 256 byte pairs
; Saved:       esi, edi (pushed/popped here)
; Clobbers:    ecx, xmm0-xmm3, xmm6, xmm7, flags.
;
; Alignment: ref rows are fetched with movdqu and may be unaligned.  cur rows
; are consumed directly as the memory operand of psadbw, which requires a
; 16-byte-aligned address, so every cur row must be 16-byte aligned.
;
; xmm6 collects rows 0 and 2 of each 4-row group, xmm7 rows 1 and 3.  PSADBW
; yields one sum per 64-bit half, so the halves are folded at the end.
;===========================================================================
align 64
global _sad16x16_sse2
_sad16x16_sse2:
        push    edi
        push    esi

        mov     esi, [esp + 8 + 4]      ; esi = cur
        mov     edi, [esp + 8 + 8]      ; edi = ref
        mov     eax, [esp + 8 + 12]     ; eax = stride
        lea     ecx, [eax + eax*2]      ; ecx = 3 * stride

        pxor    xmm6, xmm6              ; accumulator for rows 0, 2 of a group
        pxor    xmm7, xmm7              ; accumulator for rows 1, 3 of a group

%rep 3                                  ; first three groups of four rows
        movdqu  xmm0, [edi]             ; ref rows 0..3 of this group
        movdqu  xmm1, [edi + eax]
        movdqu  xmm2, [edi + eax*2]
        movdqu  xmm3, [edi + ecx]

        psadbw  xmm0, [esi]             ; per-half SAD against the cur rows
        psadbw  xmm1, [esi + eax]
        psadbw  xmm2, [esi + eax*2]
        psadbw  xmm3, [esi + ecx]

        lea     esi, [esi + eax*4]      ; move on to the next four rows
        lea     edi, [edi + eax*4]

        paddusw xmm6, xmm0
        paddusw xmm7, xmm1
        paddusw xmm6, xmm2
        paddusw xmm7, xmm3
%endrep

        ; Final group of four rows: the pointers are not needed afterwards.
        movdqu  xmm0, [edi]
        movdqu  xmm1, [edi + eax]
        movdqu  xmm2, [edi + eax*2]
        movdqu  xmm3, [edi + ecx]

        psadbw  xmm0, [esi]
        psadbw  xmm1, [esi + eax]
        psadbw  xmm2, [esi + eax*2]
        psadbw  xmm3, [esi + ecx]

        paddusw xmm6, xmm0
        paddusw xmm7, xmm1
        paddusw xmm6, xmm2
        paddusw xmm7, xmm3

        ; Fold the upper 64-bit half of each accumulator onto its lower half,
        ; then combine the two accumulators.  Only the low qword matters.
        movhlps xmm0, xmm6              ; xmm0.low = xmm6.high
        movhlps xmm1, xmm7              ; xmm1.low = xmm7.high
        paddusw xmm6, xmm0
        paddusw xmm7, xmm1
        pop     esi
        paddusw xmm6, xmm7
        pop     edi
        movd    eax, xmm6               ; return the total
        ret
;===========================================================================
;
; uint32_t sad8_mmx(const uint8_t * const cur,
;                   const uint8_t * const ref,
;                   const uint32_t stride);
;
; Sum of absolute differences between two 8x8 byte blocks, plain MMX.
;
; In  (stack): cur, ref - top-left bytes of the two blocks
;              stride   - distance in bytes between consecutive rows
; Out:         eax      - SAD of the 64 byte pairs
; Saved:       esi, edi (pushed/popped here)
; Clobbers:    ecx, mm0-mm7, flags.  No EMMS is executed in this routine.
;===========================================================================
align 64
global _sad8_mmx
_sad8_mmx:
        push    edi
        push    esi

        mov     esi, [esp + 8 + 4]      ; esi = cur
        mov     edi, [esp + 8 + 8]      ; edi = ref
        mov     eax, [esp + 8 + 12]     ; eax = stride

        pxor    mm6, mm6                ; mm6 = four 16-bit partial sums
        pxor    mm7, mm7                ; mm7 = 0, used to widen bytes to words
        mov     ecx, 4                  ; four passes of two rows each

.two_rows:
        movq    mm0, [esi]              ; cur, first row of the pair
        movq    mm1, [edi]              ; ref, first row of the pair
        add     esi, eax
        add     edi, eax
        movq    mm2, [esi]              ; cur, second row of the pair
        movq    mm3, [edi]              ; ref, second row of the pair
        add     esi, eax                ; leave the pointers on the next pair
        add     edi, eax

        movq    mm4, mm0                ; copies of cur for the reverse
        movq    mm5, mm2                ;   subtraction
        psubusb mm0, mm1                ; max(cur - ref, 0)
        psubusb mm1, mm4                ; max(ref - cur, 0)
        psubusb mm2, mm3
        psubusb mm3, mm5
        por     mm0, mm1                ; mm0 = |cur - ref|, first row
        por     mm2, mm3                ; mm2 = |cur - ref|, second row

        movq    mm1, mm0
        movq    mm3, mm2
        punpcklbw mm0, mm7              ; widen the byte differences to words
        punpckhbw mm1, mm7
        punpcklbw mm2, mm7
        punpckhbw mm3, mm7
        paddusw mm0, mm1
        paddusw mm2, mm3
        paddusw mm6, mm0                ; accumulate first row
        paddusw mm6, mm2                ; accumulate second row

        dec     ecx
        jnz     .two_rows

        pmaddwd mm6, [mmx_one]          ; fold 4 word lanes into 2 dwords
        pop     esi
        movq    mm7, mm6
        pop     edi
        psrlq   mm7, 32
        paddd   mm6, mm7                ; low dword = total SAD
        movd    eax, mm6
        ret
;===========================================================================
;
; uint32_t sad8_xmm(const uint8_t * const cur,
;                   const uint8_t * const ref,
;                   const uint32_t stride);
;
; Sum of absolute differences between two 8x8 byte blocks, using PSADBW on
; the 64-bit MMX registers.  Fully unrolled: four steps of two rows each.
;
; In  (stack): cur, ref - top-left bytes of the two blocks
;              stride   - distance in bytes between consecutive rows
; Out:         eax      - SAD of the 64 byte pairs
; Saved:       esi, edi (pushed/popped here)
; Clobbers:    ecx, mm0, mm2, mm6, flags.
;              No EMMS is executed in this routine.
;===========================================================================
align 64
global _sad8_xmm
_sad8_xmm:
        push    esi
        push    edi

        mov     esi, [esp + 8 + 4]      ; esi = cur
        mov     edi, [esp + 8 + 8]      ; edi = ref
        mov     eax, [esp + 8 + 12]     ; eax = stride
        lea     ecx, [eax + eax]        ; ecx = 2 * stride

        pxor    mm6, mm6                ; running total

%rep 4                                  ; two rows per repetition
        movq    mm0, [edi]              ; ref row r
        movq    mm2, [edi + eax]        ; ref row r+1
        psadbw  mm0, [esi]              ; mm0 = SAD(ref row r,   cur row r)
        psadbw  mm2, [esi + eax]        ; mm2 = SAD(ref row r+1, cur row r+1)
        add     esi, ecx                ; step both blocks by two rows
        add     edi, ecx
        paddusw mm6, mm0
        paddusw mm6, mm2
%endrep

        pop     edi
        movd    eax, mm6                ; return the total
        pop     esi
        ret
;===========================================================================
;
; uint32_t get_cbp_mmx(const int16_t coeff[6][64]);
;
; Builds a 6-bit mask telling which of the six 64-coefficient blocks contain
; a non-zero value, not counting the first coefficient of each block (it is
; cleared with mask_dc before the test).
;
; In  (stack): coeff - six consecutive blocks of 64 int16 values (128 bytes
;                      per block)
; Out:         eax   - bit (5 - i) is set when block i has a non-zero
;                      coefficient among entries 1..63
; Saved:       ebx, esi (pushed/popped here)
; Clobbers:    ecx, mm0, mm1, mm2, flags.
;              No EMMS is executed in this routine.
;===========================================================================
align 64
global _get_cbp_mmx
_get_cbp_mmx:
        push    ebx
        push    esi

        mov     esi, [esp + 8 + 4]      ; esi = coeff
        xor     eax, eax                ; cbp = 0
        movq    mm2, [mask_dc]          ; mask that drops word 0
        mov     ecx, 6                  ; blocks remaining

align 8
.next_block:
        ; OR all 128 bytes of the block together, with word 0 masked out.
        movq    mm0, [esi]
        movq    mm1, [esi + 8]
        pand    mm0, mm2
%assign CBP_ROW_OFF 16
%rep 7                                  ; byte offsets 16..127, 16 per step
        por     mm0, [esi + CBP_ROW_OFF]
        por     mm1, [esi + CBP_ROW_OFF + 8]
%assign CBP_ROW_OFF CBP_ROW_OFF + 16
%endrep
%undef CBP_ROW_OFF
        por     mm0, mm1

        movq    mm1, mm0                ; fold the high dword onto the low one
        psrlq   mm1, 32
        por     mm0, mm1
        movd    ebx, mm0                ; ebx != 0  <=>  block has coefficients

        add     esi, 128                ; next block (64 * sizeof(int16_t))
        test    ebx, ebx
        jz      short .skip_bit

        lea     ebx, [ecx - 1]          ; cbp |= 1 << (ecx - 1)
        bts     eax, ebx
.skip_bit:
        dec     ecx
        jnz     short .next_block

        pop     esi
        pop     ebx
        ret
;===========================================================================
;
; uint32_t dev16_mmx(const uint8_t * const cur,
; const uint32_t stride, uint32_t * const mean);
;===========================================================================
align 64
global _dev16_mmx
_dev16_mmx
push edi
push esi
mov esi, [esp + 8 + 4] ; cur
mov eax, [esp + 8 + 8] ; stride
pxor mm6, mm6 ; mm6 = 0
pxor mm4, mm4 ; mm4 = sum = 0
pxor mm5, mm5 ; mm5 = sum2 = 0
mov edi, esi
mov ecx, 16
dev16_avg_loop:
movq mm0, [esi]
movq mm2, [esi + 8]
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm6
punpckhbw mm1, mm6
punpcklbw mm2, mm6
punpckhbw mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
paddw mm4, mm0
paddw mm5, mm2
add esi, eax
;// next row
movq mm0, [esi]
movq mm2, [esi + 8]
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm6
punpckhbw mm1, mm6
punpcklbw mm2, mm6
punpckhbw mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
paddw mm4, mm0
paddw mm5, mm2
add esi, eax
sub ecx, 2 ; In each loop two rows are added
jnz dev16_avg_loop
paddusw mm4, mm5
pmaddwd mm4, [mmx_one]
pxor mm7, mm7
movq mm5, mm4
mov esi, edi
psrlq mm5, 32
movq mm0, [esi] ; first row of data
paddd mm5, mm4 ; merge sum+sum2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -