⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mefunctions_mmx.asm

📁 经典的MP4编解码核心库
💻 ASM
📖 第 1 页 / 共 3 页
字号:
;/**************************************************************************
; *
; *	History:
; *
; * 01.06.2002  imported from XVID MPEG-4 VIDEO CODEC sad_mmx.asm, 
; *             rewrote and added more functionality.  Sigma Designs, Inc.
; *
; * History of original sad_mmx.asm:
; *
; * 17.11.2001  bugfix and small improvement for dev16_xmm,
; *             removed terminate early in sad16_xmm 
; *	12.11.2001	inital version; (c)2001 peter ross <pross@cs.rmit.edu.au>
; *
; *************************************************************************/


bits 32

section .data

align 16

; 16-byte-aligned constants shared by the SAD / CBP / deviation routines below.
mmx_one		times 4	dw 1	; four words of 1: PMADDWD against this folds 4 word lanes into 2 dword sums
mask_dc		dw	0, -1, -1, -1	; PAND mask that zeroes word 0 (the DC coefficient) of a coefficient qword
mmx_65280	times 2 dd 128*255	; NOTE(review): actual value is 128*255 = 32640, not 65280 as the name implies; not referenced in the visible code -- confirm its use before renaming
mmx_255		times 4 dw 255	; not referenced in the visible code; presumably used further down the file -- TODO confirm

section .text

;===========================================================================
;
; uint32_t sad16x16_mmx(const uint8_t * const cur,
;					    const uint8_t * const ref,
;					    const uint32_t stride)
;===========================================================================

;---------------------------------------------------------------------------
; uint32_t sad16x16_mmx(const uint8_t * const cur,
;                       const uint8_t * const ref,
;                       const uint32_t stride)
;
; Sum of absolute differences over a 16x16 block, plain MMX.
; ABI:      32-bit cdecl; args on the stack, result in eax.
; Clobbers: eax, ecx, mm0-mm7, flags (esi/edi saved and restored).
; NOTE(review): no EMMS here -- the caller is presumably expected to issue
;               EMMS after a batch of MMX routines; confirm against callers.
;
; Fix: the original software-pipelined loop prefetched the next row of `cur`
; after the last row had already been processed, reading 16 bytes one full
; row past the end of the block on the final iteration.  This version loads
; each row inside the iteration that consumes it, so no out-of-bounds read
; occurs; the arithmetic and the returned SAD are unchanged.
;---------------------------------------------------------------------------
align 64
global _sad16x16_mmx
_sad16x16_mmx
		push edi
		push esi

		mov esi, [esp + 8 + 4]		; esi = cur (current / estimated MB)
		mov edi, [esp + 8 + 8]		; edi = ref (reference MB)
		mov eax, [esp + 8 + 12]		; eax = stride

		pxor mm7, mm7				; mm7 = SAD accumulator (4 word lanes)
		pxor mm6, mm6				; mm6 = constant zero for byte->word unpack
		mov	ecx, 16					; 16 rows

sad16_mmx_loop:
		movq mm1, [esi]				; cur, left 8 pixels of the row
		movq mm3, [esi+8]			; cur, right 8 pixels
		movq mm0, [edi]				; ref, left 8 pixels
		movq mm2, [edi+8]			; ref, right 8 pixels

		movq mm4, mm1
		movq mm5, mm3

		psubusb mm1, mm0			; max(cur - ref, 0), saturating
		psubusb mm3, mm2
		psubusb mm0, mm4			; max(ref - cur, 0), saturating
		psubusb mm2, mm5

		por mm0, mm1				; mm0 = |cur - ref|, left half
		por mm2, mm3				; mm2 = |cur - ref|, right half

		movq mm1, mm0
		movq mm3, mm2

		punpcklbw mm0, mm6			; widen the absolute differences to words
		punpckhbw mm1, mm6
		punpcklbw mm2, mm6
		punpckhbw mm3, mm6

		paddusw mm0, mm1			; per-lane pair sums (max 510, cannot saturate)
		paddusw mm2, mm3

		paddusw mm7, mm0			; accumulate: worst case 16*1020 = 16320 per lane
		paddusw mm7, mm2

		add	esi, eax				; advance both pointers one row
		add	edi, eax
		dec	ecx
		jnz	sad16_mmx_loop

		pmaddwd mm7, [mmx_one]		; fold 4 word lanes into 2 dword sums
		movq mm0, mm7
		psrlq mm7, 32
		paddd mm0, mm7				; low dword = total SAD

		movd eax, mm0

		pop 	esi
		pop 	edi
		ret


;===========================================================================
;
; uint32_t sad16x16_xmm(const uint8_t * const cur,
;						const uint8_t * const ref,
;						const uint32_t stride)
;
;===========================================================================

;---------------------------------------------------------------------------
; uint32_t sad16x16_xmm(const uint8_t * const cur,
;                       const uint8_t * const ref,
;                       const uint32_t stride)
;
; 16x16 SAD using PSADBW on MMX registers (SSE integer extensions, "XMM"
; here in the historical MMX-extension sense).  Two independent partial
; accumulators (mm6/mm7) hide PSADBW latency; they are merged at the end.
; Saturation in PADDUSW is unreachable: the worst-case total is
; 16*16*255 = 65280 <= 65535.
; ABI:      32-bit cdecl; result in eax.  Clobbers eax, ecx, mm0-mm7, flags.
; NOTE(review): no EMMS -- presumably the caller's responsibility; confirm.
;---------------------------------------------------------------------------
align 64
global _sad16x16_xmm
_sad16x16_xmm
		push edi
		push esi
		push ebx

		mov esi, [esp + 12 + 4]		; esi = cur (current / estimated MB)
		mov edi, [esp + 12 + 8]		; edi = ref (reference MB)
		mov eax, [esp + 12 + 12]	; eax = stride
		mov	ecx, 16					; row counter
		mov	ebx, eax
		pxor mm7, mm7				; mm7 = partial sum A = 0

		add	ebx, ebx				; ebx = 2*stride (per-row-pair step)
		pxor mm6, mm6				; mm6 = partial sum B = 0

sad16_xmm_loop:
		movq mm0, [esi]				; cur row r, left 8 pixels
		movq mm2, [esi+8]			; cur row r, right 8 pixels

		movq mm3, [esi+eax]			; cur row r+1, left 8 pixels
		movq mm4, [esi+eax+8]		; cur row r+1, right 8 pixels

		psadbw mm0, [edi]			; sum |cur - ref| over 8 bytes (result in low word)
		psadbw mm2, [edi+8]
		
		paddusw mm6,mm0				; B += left half of row r
		paddusw mm7,mm2				; A += right half of row r

		psadbw mm3, [edi+eax]		; row r+1, left half
		psadbw mm4, [edi+eax+8]		; row r+1, right half

		add	esi, ebx				; advance two rows
		add	edi, ebx
		
		paddusw mm6,mm3				; B += left half of row r+1
		paddusw mm7,mm4				; A += right half of row r+1

		movq mm0, [esi]				; cur row r+2, left 8 pixels
		movq mm2, [esi+8]			; cur row r+2, right 8 pixels

		movq mm3, [esi+eax]			; cur row r+3, left 8 pixels
		movq mm4, [esi+eax+8]		; cur row r+3, right 8 pixels

		psadbw mm0, [edi]			; row r+2, left half
		psadbw mm2, [edi+8]			; row r+2, right half
		
		paddusw mm6,mm0				; B += left half of row r+2
		paddusw mm7,mm2				; A += right half of row r+2

		psadbw mm3, [edi+eax]			; row r+3, left half
		psadbw mm4, [edi+eax+8]			; row r+3, right half

		add	esi, ebx				; advance two rows
		add	edi, ebx

		paddusw mm6,mm3				; B += left half of row r+3
		paddusw mm7,mm4				; A += right half of row r+3

		sub	ecx, 4					; four rows handled per iteration
		jnz	sad16_xmm_loop

		paddusw	mm7, mm6			; total SAD = A + B (fits in one word, see header)

		pop		ebx
		pop 	esi
		pop 	edi

		movd eax, mm7				; PSADBW sums live in the low word; upper words are zero

		ret


;===========================================================================
;
; uint32_t sad16x16_sse2(const uint8_t * const cur,
;						 const uint8_t * const ref,
;						 const uint32_t stride)
;
;===========================================================================
;---------------------------------------------------------------------------
; uint32_t sad16x16_sse2(const uint8_t * const cur,
;                        const uint8_t * const ref,
;                        const uint32_t stride)
;
; 16x16 SAD, fully unrolled SSE2: one 16-byte PSADBW per row, alternating
; between two partial accumulators (xmm6/xmm7) to hide latency.
; `ref` is fetched with MOVDQU, so it may be unaligned.
; NOTE(review): `psadbw xmmN, [esi...]` is a legacy-SSE memory operand and
; therefore REQUIRES `cur` to be 16-byte aligned (and stride a multiple of
; 16) or it will #GP fault -- confirm the callers guarantee this.
; ABI: 32-bit cdecl; result in eax.  Clobbers eax, ecx, xmm0-xmm3,
; xmm6-xmm7, flags.
;---------------------------------------------------------------------------
align 64
global _sad16x16_sse2
_sad16x16_sse2
		push	edi
		push	esi

		mov		esi, [esp + 8 + 4]		; esi = cur (current / estimated MB)
		mov		edi, [esp + 8 + 8]		; edi = ref (reference MB)

		mov		eax, [esp + 8 + 12]		; eax = stride
		mov		ecx, eax

		shl		ecx, 1					; ecx = stride*2
		pxor	xmm6, xmm6				; xmm6 = partial sum A = 0

		pxor	xmm7, xmm7				; xmm7 = partial sum B = 0
		add		ecx, eax				; ecx = stride*3

		;// rows 0-3
		movdqu	xmm0, [edi]				; ref row 0, 16 pixels (unaligned OK)
		movdqu	xmm1, [edi+eax]			; ref row 1

		psadbw	xmm0, [esi]				; SAD of row 0 (two 64-bit partial sums)
		psadbw	xmm1, [esi+eax]			; SAD of row 1

		movdqu	xmm2, [edi+eax*2]		; ref row 2
		movdqu	xmm3, [edi+ecx]			; ref row 3

		paddusw	xmm6, xmm0
		paddusw	xmm7, xmm1

		psadbw	xmm2, [esi+eax*2]		; SAD of row 2
		psadbw	xmm3, [esi+ecx]			; SAD of row 3

		add		esi, eax				; advance one row...
		add		edi, eax

		paddusw	xmm6, xmm2
		paddusw	xmm7, xmm3

		add		esi, ecx				; ...plus three more: pointers now at row 4
		add		edi, ecx

		;// rows 4-7
		movdqu	xmm0, [edi]				; ref row 4
		movdqu	xmm1, [edi+eax]			; ref row 5

		psadbw	xmm0, [esi]				; SAD of row 4
		psadbw	xmm1, [esi+eax]			; SAD of row 5

		movdqu	xmm2, [edi+eax*2]		; ref row 6
		movdqu	xmm3, [edi+ecx]			; ref row 7

		paddusw	xmm6, xmm0
		paddusw	xmm7, xmm1

		psadbw	xmm2, [esi+eax*2]		; SAD of row 6
		psadbw	xmm3, [esi+ecx]			; SAD of row 7

		add		esi, eax				; advance four rows (1 + 3)
		add		edi, eax

		paddusw	xmm6, xmm2
		paddusw	xmm7, xmm3

		add		esi, ecx
		add		edi, ecx

		;// rows 8-11
		movdqu	xmm0, [edi]				; ref row 8
		movdqu	xmm1, [edi+eax]			; ref row 9

		psadbw	xmm0, [esi]				; SAD of row 8
		psadbw	xmm1, [esi+eax]			; SAD of row 9

		movdqu	xmm2, [edi+eax*2]		; ref row 10
		movdqu	xmm3, [edi+ecx]			; ref row 11

		paddusw	xmm6, xmm0
		paddusw	xmm7, xmm1

		psadbw	xmm2, [esi+eax*2]		; SAD of row 10
		psadbw	xmm3, [esi+ecx]			; SAD of row 11

		add		esi, eax				; advance four rows (1 + 3)
		add		edi, eax

		paddusw	xmm6, xmm2
		paddusw	xmm7, xmm3

		add		esi, ecx
		add		edi, ecx

		;// rows 12-15 (no pointer advance needed afterwards)
		movdqu	xmm0, [edi]				; ref row 12
		movdqu	xmm1, [edi+eax]			; ref row 13

		psadbw	xmm0, [esi]				; SAD of row 12
		psadbw	xmm1, [esi+eax]			; SAD of row 13

		movdqu	xmm2, [edi+eax*2]		; ref row 14
		movdqu	xmm3, [edi+ecx]			; ref row 15

		paddusw	xmm6, xmm0
		paddusw	xmm7, xmm1

		psadbw	xmm2, [esi+eax*2]		; SAD of row 14
		psadbw	xmm3, [esi+ecx]			; SAD of row 15

		paddusw	xmm6, xmm2
		paddusw	xmm7, xmm3

		;// horizontal reduction: each accumulator holds two 64-bit lanes
		movhlps	xmm0, xmm6				; bring high qword of A down to low qword
		movhlps	xmm1, xmm7				; bring high qword of B down to low qword

		paddusw	xmm6, xmm0
		paddusw	xmm7, xmm1

		pop 	esi
		paddusw	xmm6, xmm7				; total SAD (max 65280, no saturation possible)

		pop 	edi
		movd	eax, xmm6				; return sum


		ret
		


;===========================================================================
;
; uint32_t sad8_mmx(const uint8_t * const cur,
;					const uint8_t * const ref,
;					const uint32_t stride);
;===========================================================================
;---------------------------------------------------------------------------
; uint32_t sad8_mmx(const uint8_t * const cur,
;                   const uint8_t * const ref,
;                   const uint32_t stride)
;
; Sum of absolute differences over an 8x8 block, plain MMX.
; Two rows are processed per loop iteration.
; ABI:      32-bit cdecl; result in eax.
; Clobbers: eax, ecx, mm0-mm7, flags (esi/edi saved and restored).
; NOTE(review): no EMMS -- presumably the caller's responsibility; confirm.
;---------------------------------------------------------------------------
align 64
global _sad8_mmx
_sad8_mmx
		push 	edi
		push 	esi

		mov 	esi, [esp + 8 + 4]	; esi = cur 8x8 block
		mov 	edi, [esp + 8 + 8]	; edi = ref 8x8 block
		mov 	eax, [esp + 8 + 12]	; eax = stride
		mov		ecx, 4				; 8 rows, two per iteration
		pxor	mm7, mm7			; mm7 = SAD accumulator (4 word lanes)
		pxor	mm6, mm6			; mm6 = constant zero for byte->word unpack

sad8_mmx_lp:
		movq	mm0, [esi]			; cur, row 2i
		movq	mm1, [edi]			; ref, row 2i
		movq	mm2, [esi+eax]		; cur, row 2i+1
		movq	mm3, [edi+eax]		; ref, row 2i+1

		movq	mm4, mm0
		movq	mm5, mm2

		psubusb	mm0, mm1			; max(cur - ref, 0), saturating
		psubusb	mm2, mm3
		psubusb	mm1, mm4			; max(ref - cur, 0), saturating
		psubusb	mm3, mm5

		por		mm0, mm1			; |cur - ref| for row 2i
		por		mm2, mm3			; |cur - ref| for row 2i+1

		movq	mm1, mm0
		movq	mm3, mm2

		punpcklbw	mm0, mm6		; widen the absolute differences to words
		punpckhbw	mm1, mm6
		punpcklbw	mm2, mm6
		punpckhbw	mm3, mm6

		paddusw	mm0, mm1
		paddusw	mm2, mm3

		paddusw	mm7, mm0			; accumulate both rows
		paddusw	mm7, mm2

		lea		esi, [esi+eax*2]	; step two rows down
		lea		edi, [edi+eax*2]

		dec		ecx
		jnz		sad8_mmx_lp

		pmaddwd	mm7, [mmx_one]		; fold 4 word lanes into 2 dword sums
		movq	mm6, mm7
		psrlq	mm6, 32

		paddd	mm7, mm6			; low dword = total SAD

		movd	eax, mm7

		pop 	esi
		pop 	edi

		ret



;===========================================================================
;
; uint32_t sad8_xmm(const uint8_t * const cur,
;					const uint8_t * const ref,
;					const uint32_t stride);
;
;===========================================================================
;---------------------------------------------------------------------------
; uint32_t sad8_xmm(const uint8_t * const cur,
;                   const uint8_t * const ref,
;                   const uint32_t stride)
;
; 8x8 SAD using PSADBW (SSE integer extensions on MMX registers).
; Two rows per iteration, four iterations.  Maximum total is
; 8*8*255 = 16320, so PADDUSW can never saturate.
; ABI:      32-bit cdecl; result in eax.
; Clobbers: eax, ecx, edx, mm0, mm2, mm6, flags (esi/edi preserved).
; NOTE(review): no EMMS -- presumably the caller's responsibility; confirm.
;---------------------------------------------------------------------------
align 64
global _sad8_xmm
_sad8_xmm
		push 	esi
		push 	edi

		mov 	esi, [esp + 8 + 4]	; esi = cur 8x8 block
		mov 	edi, [esp + 8 + 8]	; edi = ref 8x8 block
		mov 	eax, [esp + 8 + 12]	; eax = stride
		lea		ecx, [eax+eax]		; ecx = 2*stride (per-iteration step)
		pxor	mm6, mm6			; mm6 = SAD accumulator
		mov		edx, 4				; 8 rows, two per iteration

sad8_xmm_lp:
		movq	mm0, [edi]			; ref, row 2i
		movq	mm2, [edi+eax]		; ref, row 2i+1

		psadbw	mm0, [esi]			; mm0 = sum |cur - ref| over row 2i
		psadbw	mm2, [esi+eax]		; mm2 = sum |cur - ref| over row 2i+1

		add		esi, ecx			; step two rows down
		add		edi, ecx

		paddusw	mm6, mm0			; accumulate row 2i
		paddusw	mm6, mm2			; accumulate row 2i+1

		dec		edx
		jnz		sad8_xmm_lp

		pop 	edi
		movd	eax, mm6			; PSADBW sums live in the low word

		pop 	esi

		ret



;===========================================================================
;
; uint32_t get_cbp_mmx(const int16_t coeff[6][64]);
;===========================================================================

;---------------------------------------------------------------------------
; uint32_t get_cbp_mmx(const int16_t coeff[6][64])
;
; Builds the coded-block-pattern: for block i (0..5), bit (5 - i) of the
; result is set iff the block has any nonzero AC coefficient.  The DC
; coefficient (word 0 of each block) is masked out via mask_dc before the
; OR-reduction, so a DC-only block contributes no bit.
; ABI:      32-bit cdecl; result in eax.
; Clobbers: eax, ecx, mm0-mm2, flags (ebx/esi saved and restored).
; NOTE(review): no EMMS -- presumably the caller's responsibility; confirm.
;---------------------------------------------------------------------------
align 64
global _get_cbp_mmx
_get_cbp_mmx
				push	ebx
				push	esi

				mov		esi, [esp + 8 + 4]		; esi = &coeff[0][0]
				xor		eax, eax				; eax = cbp = 0

				movq	mm2, [mask_dc]			; mask that zeroes the DC word
				mov		ecx, 6					; six blocks of 64 int16 (128 bytes)

align 8
cbp_loop:
				; OR all 16 qwords of the block together; OR is associative
				; and commutative, so a straight serial chain is equivalent.
				movq	mm0, [esi]
				pand	mm0, mm2				; drop the DC coefficient
				por		mm0, [esi+8]
				por		mm0, [esi+16]
				por		mm0, [esi+24]
				por		mm0, [esi+32]
				por		mm0, [esi+40]
				por		mm0, [esi+48]
				por		mm0, [esi+56]
				por		mm0, [esi+64]
				por		mm0, [esi+72]
				por		mm0, [esi+80]
				por		mm0, [esi+88]
				por		mm0, [esi+96]
				por		mm0, [esi+104]
				por		mm0, [esi+112]
				por		mm0, [esi+120]

				movq	mm1, mm0				; fold the qword to a dword
				psrlq	mm1, 32
				por		mm0, mm1

				movd	ebx, mm0
				add		esi, 128				; next 64-coefficient block

				test	ebx, ebx				; all-zero AC coefficients?
				jz		cbp_next

				lea		ebx, [ecx-1]			; cbp |= 1 << (ecx-1)
				bts		eax, ebx

cbp_next:
				dec		ecx
				jnz		cbp_loop

				pop	esi
				pop	ebx

				ret


;===========================================================================
;
; uint32_t dev16_mmx(const uint8_t * const cur,
;					const uint32_t stride, uint32_t * const mean);
;===========================================================================

align 64
global _dev16_mmx
_dev16_mmx

		push 	edi
		push 	esi

		mov 	esi, [esp + 8 + 4]	; cur
		mov 	eax, [esp + 8 + 8]	; stride

		pxor	mm6, mm6			; mm6 = 0
		pxor	mm4, mm4			; mm4 = sum = 0	
		pxor	mm5, mm5			; mm5 = sum2 = 0

		mov     edi, esi
		mov		ecx, 16

dev16_avg_loop:
		movq	mm0, [esi]

		movq mm2, [esi + 8]
		movq mm1, mm0

		movq mm3, mm2
		punpcklbw mm0, mm6

		punpckhbw mm1, mm6		
		punpcklbw mm2, mm6

		punpckhbw mm3, mm6		
		paddw mm0, mm1

		paddw mm2, mm3
		paddw mm4, mm0

		paddw mm5, mm2
		add	  esi, eax

		;// next row
		movq	mm0, [esi]
		movq	mm2, [esi + 8]

		movq mm1, mm0
		movq mm3, mm2

		punpcklbw mm0, mm6
		punpckhbw mm1, mm6
				
		punpcklbw mm2, mm6
		punpckhbw mm3, mm6
				
		paddw mm0, mm1
		paddw mm2, mm3

		paddw mm4, mm0
		paddw mm5, mm2

		add	  esi, eax
		sub	  ecx, 2				; In each loop two rows are added

		jnz	  dev16_avg_loop

		paddusw	mm4, mm5
		pmaddwd mm4, [mmx_one]	
		
		pxor	mm7, mm7
		movq	mm5, mm4
		
		mov		esi, edi
		psrlq	mm5, 32 

		movq	mm0, [esi]			; first row of data
		paddd	mm5, mm4			; merge sum+sum2

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -