⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mefunctions_mmx.asm

📁 经典的MP4编解码核心库
💻 ASM
📖 第 1 页 / 共 3 页
字号:

		psllq	mm5, 32			
		psrlq	mm5, 32 + 8			; get mean of MB

		punpckldq mm5, mm5
		mov		ecx, 16

		packssdw mm5, mm5			; mm5 = mean; scatter mean into four words

loop4dev:
		movq mm2, [esi + 8]			; second half of data in the same row

		movq mm1, mm0
		movq mm3, mm2

		punpcklbw mm0, mm6			; byte -> word
		punpcklbw mm2, mm6

		punpckhbw mm1, mm6			; mm01 = cur
		movq mm4, mm5				; mm45 = mean

		punpckhbw mm3, mm6			; mm23 = cur2
		psubusw	mm4, mm0			

		psubusw mm0, mm5		
		por		mm0, mm4			; mm0 = |mm0-mean|

		movq	mm4, mm5			
		paddw	mm7, mm0			; dev += mm0

		psubusw mm4, mm1		
		psubusw mm1, mm5		

		por mm1, mm4				; mm1 = |mm1 - mm4|
		movq mm4, mm5			

		paddw	mm7, mm1			; dev += mm1
		psubusw mm4, mm2		

		psubusw mm2, mm5		
		por		mm2, mm4			; mm2 = |mm2 - mean|

		movq	mm4, mm5			
		paddw	mm7, mm2			; dev += mm2

		psubusw mm4, mm3
		psubusw mm3, mm5
					
		por		mm3, mm4			; mm3 = |mm3 - mean|
		add		esi, eax

		paddw	mm7, mm3			; dev += mm3

		movq mm0, [esi]
		movq mm2, [esi + 8]			; second half of data in the same row

		movq mm1, mm0
		movq mm3, mm2

		punpcklbw mm0, mm6			; byte -> word
		punpcklbw mm2, mm6

		punpckhbw mm1, mm6			; mm01 = cur
		movq mm4, mm5				; mm45 = mean

		punpckhbw mm3, mm6			; mm23 = cur2
		psubusw	mm4, mm0			

		psubusw mm0, mm5		
		por		mm0, mm4			; mm0 = |mm0-mean|

		movq	mm4, mm5			
		paddw	mm7, mm0			; dev += mm0

		psubusw mm4, mm1		
		psubusw mm1, mm5		

		por mm1, mm4				; mm1 = |mm1 - mm4|
		movq mm4, mm5			

		paddw	mm7, mm1			; dev += mm1
		psubusw mm4, mm2		

		psubusw mm2, mm5		
		por		mm2, mm4			; mm2 = |mm2 - mean|

		movq	mm4, mm5			
		paddw	mm7, mm2			; dev += mm2

		psubusw mm4, mm3
		psubusw mm3, mm5
					
		por		mm3, mm4			; mm3 = |mm3 - mean|
		add		esi, eax

		paddw	mm7, mm3			; dev += mm3
		sub		ecx, 2

		movq	mm0, [esi]			; load data
		jnz		loop4dev

		pmaddwd mm7, [mmx_one]		; merge dev
		mov	esi, [esp+8+12]

		movq mm0, mm7
		punpcklwd mm5, mm6

		psrlq mm0, 32		 
		movd	[esi], mm5

		paddd mm7, mm0
		pop 	esi

		movd eax, mm7
		pop 	edi

		ret



;===========================================================================
;
; uint32_t dev16_xmm(const uint8_t * const cur,
;					 const uint32_t stride, 
;					 uint32_t * const mean);
;
; For the 16x16 block of bytes at cur (consecutive rows are stride bytes
; apart):
;	*mean  = (sum of the 256 bytes) >> 8
;	return = sum over the 256 bytes of |byte - *mean|
;
; The three arguments are read from the stack and the result is returned
; in eax.  edi and esi are saved and restored; eax, the flags and mm0-mm7
; are overwritten.  No emms is executed in this routine.
;
;===========================================================================

align 64
global _dev16_xmm
_dev16_xmm:

		push 	edi
		push 	esi

		mov 	esi, [esp + 8 + 4]	; cur
		mov 	eax, [esp + 8 + 8]	; stride
		mov 	edi, [esp + 8 + 12]	; mean (output pointer)

		pxor	mm6, mm6			; mm6 = 0; psadbw against zero adds up 8 bytes
		pxor	mm4, mm4			; mm4 = sum of the right 8 columns
		pxor	mm5, mm5			; mm5 = sum of the left 8 columns

		;// pass 1: add up all 256 pixels, one row per repetition (kept unrolled)
%rep 16
		movq	mm0, [esi]			; left half of the row
		movq	mm2, [esi + 8]		; right half of the row

		psadbw	mm0, mm6
		psadbw	mm2, mm6

		add		esi, eax			; step to the next row
		paddusw	mm5, mm0
		paddusw	mm4, mm2
%endrep

		paddusw	mm5, mm4			; mm5 = sum of the whole block (<= 255*256, fits a word)
		mov		esi, [esp + 8 + 4]	; rewind to the first row for pass 2

		psllq	mm5, 32				; clean left 32 bits
		psrlq	mm5, 32 + 8			; low dword of mm5 = sum >> 8 = mean

		; Keep the plain 32-bit mean for the final store.  Once mm5 has been
		; packed down to bytes, widening it with punpcklwd would yield
		; 0x0000MMMM (mean * 257) rather than the mean itself.
		movq	mm4, mm5

		punpckldq mm5, mm5			; mean in both dwords
		packssdw mm5, mm5			; mean in all four words
		packuswb mm5, mm5			; mean in all eight bytes

		pxor	mm7, mm7			; mm7 = deviation of the left 8 columns
									; mm6 is still 0 = deviation of the right 8 columns

		;// pass 2: add up |pixel - mean|, one row per repetition (kept unrolled)
%rep 16
		movq	mm0, [esi]			; left half of the row
		movq	mm2, [esi + 8]		; right half of the row

		psadbw	mm0, mm5			; mm0 = sum of |pixel - mean| over 8 bytes
		psadbw	mm2, mm5

		add		esi, eax			; step to the next row
		paddusw	mm7, mm0
		paddusw	mm6, mm2
%endrep

		paddusw	mm7, mm6			; total deviation (<= 255*256, cannot saturate)

		movd	[edi], mm4			; return mean
		movd	eax, mm7			; return value: deviation

		pop 	esi
		pop 	edi

		ret

;===========================================================================
;
; uint32_t dev16_sse2(const uint8_t * const cur,
;					 const uint32_t stride, 
;					 uint32_t * const mean);
;
;===========================================================================

align 64
global _dev16_sse2
_dev16_sse2

		push 	edi
		push 	esi

		mov 	esi, [esp + 8 + 4]	; cur
		mov 	eax, [esp + 8 + 8]	; stride

		pxor	xmm5, xmm5
		mov		ecx, eax

		pxor	xmm6, xmm6
		shl		ecx, 1				; ecx = 2*stride
		
		mov		edi, esi
		movdqa	xmm0, [esi]			; 1st row

		movdqa	xmm2, [esi+eax]		; 2nd row
		psadbw	xmm0, xmm6

		psadbw	xmm2, xmm6
		add		esi, ecx			; start working on 3rd and 4th row

		movdqa	xmm1, [esi]		
		paddw	xmm5, xmm0

		movdqa	xmm3, [esi+eax]
		paddw	xmm5, xmm2

		psadbw	xmm1, xmm6
		psadbw	xmm3, xmm6

		add		esi, ecx			; start working on 5th and 6th row

		movdqa	xmm0, [esi]		
		paddw	xmm5, xmm1

		movdqa	xmm2, [esi+eax]
		paddw	xmm5, xmm3

		psadbw	xmm0, xmm6
		psadbw	xmm2, xmm6

		add		esi, ecx			; start working on 7th and 8th row

		movdqa	xmm1, [esi]		
		paddw	xmm5, xmm0

		movdqa	xmm3, [esi+eax]
		paddw	xmm5, xmm2

		psadbw	xmm1, xmm6
		psadbw	xmm3, xmm6

		add		esi, ecx			; start working on 9th and 10th row

		movdqa	xmm0, [esi]		
		paddw	xmm5, xmm1

		movdqa	xmm2, [esi+eax]
		paddw	xmm5, xmm3

		psadbw	xmm0, xmm6
		psadbw	xmm2, xmm6

		add		esi, ecx			; start working on 11th and 12th row

		movdqa	xmm1, [esi]		
		paddw	xmm5, xmm0

		movdqa	xmm3, [esi+eax]
		paddw	xmm5, xmm2

		psadbw	xmm1, xmm6
		psadbw	xmm3, xmm6

		add		esi, ecx			; start working on 13th and 14th row

		movdqa	xmm0, [esi]		
		paddw	xmm5, xmm1

		movdqa	xmm2, [esi+eax]
		paddw	xmm5, xmm3

		psadbw	xmm0, xmm6
		psadbw	xmm2, xmm6

		add		esi, ecx			; start working on 15th and 16th row

		movdqa	xmm1, [esi]		
		paddw	xmm5, xmm0

		movdqa	xmm3, [esi+eax]
		paddw	xmm5, xmm2

		psadbw	xmm1, xmm6
		psadbw	xmm3, xmm6

		paddw	xmm5, xmm1
		add		ecx, eax				; now ecx = 3*stride

		paddw	xmm5, xmm3
		pxor	xmm7, xmm7				; xmm7 = dev = 0

		movhlps	xmm4, xmm5
		paddw	xmm5, xmm4

		movdqa	xmm0, [edi]
		psrlw	xmm5, 8

		movdqa	xmm1, [edi+eax]
		movlhps	xmm5, xmm5

		movdqa	xmm2, [edi+eax*2]
		movdqa	xmm4, xmm5

		movdqa	xmm3, [edi+ecx]
		psllq	xmm4, 32 

		add		edi, eax
		paddd	xmm5, xmm4

		add		edi, ecx					; ready to load next four rows  !!!, from 5th row
		packssdw xmm5, xmm5

		mov		esi, [esp+8+12]				; ready to load value for mean
		packuswb xmm5, xmm5					; scatter mean over xmm4 in byte mode

		psadbw	xmm0, xmm5
		psadbw	xmm1, xmm5

		paddw	xmm7, xmm0
		paddw	xmm6, xmm1

		psadbw	xmm2, xmm5
		psadbw	xmm3, xmm5
		
		movdqa	xmm0, [edi]					; load 5th row
		movdqa	xmm1, [edi+eax]				; load 6th row

		paddw	xmm6, xmm2
		paddw	xmm7, xmm3					; finish accumulation for last four rows

		movdqa	xmm2, [edi+eax*2]			; load 7th row
		movdqa	xmm3, [edi+ecx]				; load 8th row

		add		edi, eax
		psadbw	xmm0, xmm5

		psadbw	xmm1, xmm5
		add		edi, ecx					; ready to load next four rows  !!!, from 9th row
	
		paddw	xmm6, xmm0
		paddw	xmm7, xmm1

		psadbw	xmm2, xmm5
		psadbw	xmm3, xmm5

		paddw	xmm6, xmm2

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -