⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sad_mmx.asm

📁 MPEG-4编解码的实现(包括MPEG4视音频编解码)
💻 ASM
📖 第 1 页 / 共 2 页
字号:


;===========================================================================
;
; uint32_t sad8_xmm(const uint8_t * const cur,
;					const uint8_t * const ref,
;					const uint32_t stride);
;
; Sum of absolute differences between two 8x8 byte blocks, using psadbw.
;
; Arguments are read from the stack (32-bit code, after two pushes):
;   [esp + 8 + 4]   first block pointer   -> esi
;   [esp + 8 + 8]   second block pointer  -> edi
;   [esp + 8 + 12]  stride in bytes       -> ecx
; psadbw is symmetric in its operands, so the result does not depend on
; which of the two pointers is "cur" and which is "ref".
;
; Returns: eax = SAD of the 8 rows of 8 bytes.
; Saves/restores esi and edi; overwrites ecx, edx, mm0, mm2, mm6.
; No emms is executed here.
;
; experimental!
;
;===========================================================================
align 16
cglobal sad8_xmm
sad8_xmm:
		push	esi
		push	edi

		mov	esi, [esp + 8 + 4]	; first block
		mov	edi, [esp + 8 + 8]	; second block
		mov	ecx, [esp + 8 + 12]	; stride
		lea	edx, [ecx + ecx]	; edx = 2 * stride (two rows per step)

		pxor	mm6, mm6		; mm6 = running SAD

		; Rows 0..5: three steps of two rows each. %rep is expanded at
		; assembly time, so this is straight-line code with no branches.
%rep 3
		movq	mm0, [esi]		; row n   of first block
		movq	mm2, [esi + ecx]	; row n+1 of first block

		psadbw	mm0, [edi]		; mm0 = SAD of row n
		psadbw	mm2, [edi + ecx]	; mm2 = SAD of row n+1

		paddusw	mm6, mm0
		paddusw	mm6, mm2

		add	esi, edx
		add	edi, edx
%endrep

		; Rows 6..7: same work, but the pointers are not advanced afterwards.
		movq	mm0, [esi]
		movq	mm2, [esi + ecx]

		psadbw	mm0, [edi]
		psadbw	mm2, [edi + ecx]

		paddusw	mm6, mm0
		paddusw	mm6, mm2

		movd	eax, mm6		; return the low dword of the accumulator

		pop	edi
		pop	esi

		ret



;===========================================================================
;
; uint32_t dev16_mmx(const uint8_t * const cur,
;					const uint32_t stride);
;
; Sum of absolute deviations from the mean over a 16x16 byte block.
;
; Pass 1 adds up all 256 bytes and derives mean = sum >> 8.
; Pass 2 walks the same rows again and accumulates |byte - mean|.
;
; Arguments are read from the stack (32-bit code, after two pushes):
;   [esp + 8 + 4]  cur     -> esi (pass 1 cursor), edi (pass 2 cursor)
;   [esp + 8 + 8]  stride  -> ecx
;
; Returns: eax = sum of |byte - mean|.
; Saves/restores esi and edi; overwrites eax, ecx, mm0-mm7.
; Reads the memory constant mmx_one, which is defined outside this block
; (presumably four 16-bit ones, used with pmaddwd to add word lanes in
; pairs -- confirm against its definition).
; No emms is executed here.
;
;===========================================================================

align 16
cglobal dev16_mmx
dev16_mmx:

		push	esi
		push	edi

		mov	esi, [esp + 8 + 4]	; cur
		mov	ecx, [esp + 8 + 8]	; stride
		mov	edi, esi		; keep the block start for pass 2

		pxor	mm7, mm7		; mm7 = 0, used to widen bytes to words
		pxor	mm4, mm4		; mm4 = word sums, left 8 bytes of each row
		pxor	mm5, mm5		; mm5 = word sums, right 8 bytes of each row

		mov	eax, 16			; row counter
.sum_row:
		; left 8 bytes -> 4 word lanes, each holding the sum of two pixels
		movq	mm0, [esi]
		movq	mm1, mm0
		punpcklbw mm0, mm7
		punpckhbw mm1, mm7
		paddw	mm0, mm1
		paddw	mm4, mm0

		; right 8 bytes, same treatment
		movq	mm2, [esi + 8]
		movq	mm3, mm2
		punpcklbw mm2, mm7
		punpckhbw mm3, mm7
		paddw	mm2, mm3
		paddw	mm5, mm2

		add	esi, ecx
		dec	eax
		jnz	.sum_row

		; Fold the eight word lanes into one dword total.
		paddusw	mm4, mm5
		pmaddwd	mm4, [mmx_one]		; word lanes -> two dword partial sums
		movq	mm5, mm4
		psrlq	mm5, 32
		paddd	mm4, mm5		; low dword = total of all 256 bytes

		; Discard the stale high dword, then divide by 256.
		psllq	mm4, 32			; total moves to the high dword, low = 0
		psrlq	mm4, 32 + 8		; low dword = total >> 8, high dword = 0

		punpckldq mm4, mm4
		packssdw mm4, mm4		; mm4 = mean replicated in all four words

		pxor	mm6, mm6		; mm6 = deviation accumulator (word lanes)
		mov	eax, 16			; row counter
.dev_row:
		; ---- left 8 bytes ----
		movq	mm0, [edi]
		movq	mm1, mm0
		punpcklbw mm0, mm7		; mm0 = bytes 0..3 as words
		punpckhbw mm1, mm7		; mm1 = bytes 4..7 as words

		; |x - mean| = (mean -sat x) | (x -sat mean): one side is always 0
		movq	mm5, mm4
		psubusw	mm5, mm0
		psubusw	mm0, mm4
		por	mm0, mm5

		movq	mm5, mm4
		psubusw	mm5, mm1
		psubusw	mm1, mm4
		por	mm1, mm5

		paddw	mm0, mm1
		paddw	mm6, mm0

		; ---- right 8 bytes ----
		movq	mm2, [edi + 8]
		movq	mm3, mm2
		punpcklbw mm2, mm7		; mm2 = bytes 8..11 as words
		punpckhbw mm3, mm7		; mm3 = bytes 12..15 as words

		movq	mm5, mm4
		psubusw	mm5, mm2
		psubusw	mm2, mm4
		por	mm2, mm5

		movq	mm5, mm4
		psubusw	mm5, mm3
		psubusw	mm3, mm4
		por	mm3, mm5

		paddw	mm2, mm3
		paddw	mm6, mm2

		add	edi, ecx
		dec	eax
		jnz	.dev_row

		; Fold the four word lanes of the deviation into one dword.
		pmaddwd	mm6, [mmx_one]
		movq	mm7, mm6		; mm7 is free now, reuse it as scratch
		psrlq	mm7, 32
		paddd	mm6, mm7
		movd	eax, mm6

		pop	edi
		pop	esi

		ret



;===========================================================================
;
; uint32_t dev16_xmm(const uint8_t * const cur,
;					const uint32_t stride);
;
; Sum of absolute deviations from the mean over a 16x16 byte block,
; using psadbw for both passes.
;
; Pass 1: psadbw against zero adds up the bytes of each row half; the
;         total of all 256 bytes is shifted right by 8 to get the mean,
;         which is then replicated into all eight bytes of mm4.
; Pass 2: psadbw against that replicated mean accumulates |byte - mean|.
;
; Arguments are read from the stack (32-bit code, after two pushes):
;   [esp + 8 + 4]  cur     -> esi (pass 1 cursor), edi (pass 2 cursor)
;   [esp + 8 + 8]  stride  -> ecx
;
; Returns: eax = sum of |byte - mean|.
; Saves/restores esi and edi; overwrites ecx, mm0, mm2, mm4-mm7.
; No emms is executed here.
;
; Both passes are fully unrolled. %rep is expanded at assembly time, so
; the generated code is straight-line with no branches.
;
; experimental!
;
;===========================================================================

align 16
cglobal dev16_xmm
dev16_xmm:

		push	esi
		push	edi

		mov	esi, [esp + 8 + 4]	; cur
		mov	ecx, [esp + 8 + 8]	; stride
		mov	edi, esi		; keep the block start for pass 2

		pxor	mm4, mm4		; mm4 = running byte sum
		pxor	mm7, mm7		; mm7 = 0 (psadbw against 0 = plain byte sum)

		; ---- pass 1, rows 0..14 ----
%rep 15
		movq	mm0, [esi]
		movq	mm2, [esi + 8]

		psadbw	mm0, mm7		; mm0 = sum of bytes 0..7 of the row
		psadbw	mm2, mm7		; mm2 = sum of bytes 8..15 of the row

		paddw	mm4, mm0
		paddw	mm4, mm2

		add	esi, ecx
%endrep

		; ---- pass 1, row 15 (no pointer advance afterwards) ----
		movq	mm0, [esi]
		movq	mm2, [esi + 8]

		psadbw	mm0, mm7
		psadbw	mm2, mm7

		paddw	mm4, mm0
		paddw	mm4, mm2

		; Replicate mean = sum >> 8 into all eight bytes of mm4.
		movq	mm5, mm4
		psllq	mm5, 32
		paddd	mm4, mm5		; sum in both dwords
		psrld	mm4, 8			; mean in both dwords
		packssdw mm4, mm4		; mean in all four words
		packuswb mm4, mm4		; mean in all eight bytes

		pxor	mm6, mm6		; mm6 = running deviation

		; ---- pass 2, rows 0..14 ----
%rep 15
		movq	mm0, [edi]
		movq	mm2, [edi + 8]

		psadbw	mm0, mm4		; mm0 = sum |byte - mean|, bytes 0..7
		psadbw	mm2, mm4		; mm2 = sum |byte - mean|, bytes 8..15

		paddw	mm6, mm0
		paddw	mm6, mm2

		add	edi, ecx
%endrep

		; ---- pass 2, row 15 (no pointer advance afterwards) ----
		movq	mm0, [edi]
		movq	mm2, [edi + 8]

		psadbw	mm0, mm4
		psadbw	mm2, mm4

		paddw	mm6, mm0
		paddw	mm6, mm2

		; psadbw leaves its result in the low word with the upper bits
		; zero, so the low dword of mm6 already holds the full deviation;
		; no horizontal add is needed before the move to eax.
		movd	eax, mm6

		pop	edi
		pop	esi

		ret

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -