⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idctchen.cpp

📁 这是一组DCT和iDCT的代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
#include "StdAfx.h"
#pragma warning(once:4305 4244)

/*
		IDCT routine in MMX (Chen algorithm).

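		Syntax: idct_chen(short int *x)
		        (this header was written for idct_chen_mmx1(short int *data);
		        the entry point defined in this file is named idct_chen)

		where x is the input block of 64 short ints (16-bit integers).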

		Accuracy:
		Although this routine is not IEEE-1180 compliant, its accuracy
		should be sufficient for normal use in image and video compression.
		IEEE-1180 compliance involves 6 tests over 3 ranges of input data.
		The following table shows the results of the tests for the largest
		range of data (-300 --> 300).

		Test    Requirement    idct_chen_mmx1    idct_chen_mmx2
		------------------------------------------------------
		AZ      0              0                 0
		PPE     <=1            1                 1
		PMSE    <=.06          .05               .02
		PME     <=.015         .03               .0146
		OMSE    <=.02          .035              .018
		OME     <=.0015        .010              .003

		Thus idct_chen_mmx1 passes 3 tests and idct_chen_mmx2 passes 5 tests
		(the latter routine is a little slower and is still being simulated).
		Both routines operate on data in 16-bit integer format.

		If the input data were int (32 bits) instead of short int (16 bits),
		the routine could be modified to become fully IEEE-1180 compliant (a
		trivial and boring task), but the penalty is that it would also be
		twice as slow.
	

	Comments and/or recommendations should be directed to
		knguyen@ece.ubc.ca		
		http://www.ece.ubc.ca/~knguyen/software/idct_chen_mmx1.c

	MPEG2AVI
	--------
	v0.16B33 code has been hand-scheduled (instruction reordering).
		This will yield better performance on Pentium/MMX CPUs, but won't
		help dynamic-execution CPUs (K6/2 and later, PentiumII/Celeron).

	v0.16B22 Chen MMX16 IDCT imported into MPEG2AVI
*/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Shift counts used by the MMX IDCT below.
// LS1 is the left-shift count applied with psllw to every coefficient as it
// is loaded from the block (pre-scaling by 2^4 before the multiplies).
// RS8 is defined equal to LS1 and LS8 is one less than RS8 (i.e. 3); neither
// is referenced in the part of idct_chen visible on this page --
// NOTE(review): presumably the shifts for the second (row/column) pass; confirm
// against the remainder of the routine.
#define idct_chen_mmx1_LS1		4
#define idct_chen_mmx1_RS8		idct_chen_mmx1_LS1
#define idct_chen_mmx1_LS8		(idct_chen_mmx1_RS8-1)

// Packed multiplier constants: each 64-bit value holds the same 16-bit word
// replicated four times, so one pmulhw scales four coefficients at once
// (pmulhw keeps the high 16 bits of each signed 16x16 product).
// The words are numerically equal to round(32768 * cos(k*pi/16)):
//   c0 = 0x5A82 (k=4)   c1 = 0x7D8A (k=1)   c2 = 0x6A6E (k=3)
//   c3 = 0x471D (k=5)   c4 = 0x18F9 (k=7)   c5 = 0x7642 (k=2)
//   c6 = 0x30FC (k=6)
// They are referenced by name from the __asm block, so the identifiers and
// the 64-bit storage size must not change.
const static __int64 idct_chen_mmx1_c0		=0x5A825A825A825A82;
const static __int64 idct_chen_mmx1_c1		=0x7D8A7D8A7D8A7D8A;
const static __int64 idct_chen_mmx1_c2		=0x6A6E6A6E6A6E6A6E;
const static __int64 idct_chen_mmx1_c3		=0x471D471D471D471D;
const static __int64 idct_chen_mmx1_c4		=0x18F918F918F918F9;
const static __int64 idct_chen_mmx1_c5		=0x7642764276427642;
const static __int64 idct_chen_mmx1_c6		=0x30FC30FC30FC30FC;

// Packed per-multiplier bias constants (same word replicated four times).
// In the asm, cNc is added to an operand with paddsw immediately before that
// operand is multiplied by cN with pmulhw (e.g. "b2+c6c" followed by a
// multiply by c6).
// NOTE(review): presumably these compensate for the truncation performed by
// pmulhw; the specific values (1, 1, 1, 2, 6, 1, 3) look empirically tuned --
// confirm before changing them.
const static __int64 idct_chen_mmx1_c0c		=0x0001000100010001;
const static __int64 idct_chen_mmx1_c1c		=0x0001000100010001;
const static __int64 idct_chen_mmx1_c2c		=0x0001000100010001;
const static __int64 idct_chen_mmx1_c3c		=0x0002000200020002;
const static __int64 idct_chen_mmx1_c4c		=0x0006000600060006;
const static __int64 idct_chen_mmx1_c5c		=0x0001000100010001;
const static __int64 idct_chen_mmx1_c6c		=0x0003000300030003;


void idct_chen(short int *x)				// x should be (32)align
{ 
		__asm{


		mov 	ebx, [x]

//#define mm0 mm0 
//#define mm1 mm1
//#define mm2 mm2
//#define mm3 mm3
//#define mm4 mm4
//#define mm5 mm5
//#define mm6 mm6
//#define mm7 mm7
				
		movq	mm0, [ebx+16*2];		// mm0=b2				mm0

		movq	mm1, [ebx+48*2];		// mm1=b3				mm1
		 psllw	 mm0, idct_chen_mmx1_LS1;
		movq	mm2, idct_chen_mmx1_c6c;		//				mm2
		 psllw	 mm1, idct_chen_mmx1_LS1;
		movq	mm3, idct_chen_mmx1_c5c;		//				mm3
		 movq	mm4, mm0; 					// mm4=mm0=b2 mm4

		paddsw	mm0, mm2; 					// b2+c6c
		 movq	mm5, mm1; 					// mm5=mm1=b3 mm5
		movq	mm6, idct_chen_mmx1_c6;		//				mm6
		 paddsw	mm1, mm2; 					// b3+c6c								mm2

		pmulhw	mm1, mm6; 					//
		 paddsw	mm4, mm3; 					// b2+c5c

		movq	mm7, idct_chen_mmx1_c5;		//				mm7
		 pmulhw	mm4, mm7; 					//
		paddsw	mm5, mm3; 					// b3+c5c								mm3
		 pmulhw	mm0, mm6; 					//												mm6
		movq	mm3, [ebx+32*2];		// mm3=b1				mm3
		 pmulhw	mm5, mm7; 					//												mm7
		movq	mm2, [ebx+0*2];		// mm2=b0				mm2
		 paddsw	mm1, mm4; 					// mm1=a3								mm4
		movq	mm4, idct_chen_mmx1_c0c;		//				mm4
		 psllw	mm3, idct_chen_mmx1_LS1;
		psllw	mm2, idct_chen_mmx1_LS1;


//#define mm2 mm2
//#define mm3 mm3
//#define mm4 mm4

		 psubsw	mm0,mm5;						// mm0=a2								mm5

		
		movq	mm6, idct_chen_mmx1_c0;		//				mm6
		 paddsw	mm2, mm4; 					// b0+c0c
//#define mm6 mm6

		pmulhw	mm2, mm6;
		 paddsw	mm3, mm4; 					// b1+c0c								mm4					
		pmulhw	mm3, mm6; 					//												mm6
		 ;//slot
//#define mm7 mm7
		movq	mm6, [ebx+8*2]; 		// mm6=a0				mm6 			
		 ;//slot

		psllw	mm6, idct_chen_mmx1_LS1;
		 movq	mm7, mm2; 					//								mm7
		
		paddsw	mm2, mm3; 					// mm2=a0
		 psubsw	mm7, mm3; 					// mm7=a1								mm3
																// a(0,1,2,3) in ra(24,28,16,17)
//#define mm5 mm5
		movq	mm5, mm2; 					// mm5=mm2=a0 mm5					
		 paddsw	mm2, mm1; 					// mm2=b0
		psubsw	mm5, mm1; 					// mm5=b3								mm1
		 movq	mm4, mm7; 					// mm4=mm7=a1 mm4					
		movq	[ebx+0*2], mm2;		//												mm2
		 paddsw	mm7, mm0; 					// mm7=b1
		movq	[ebx+48*2], mm5;		//												mm5

//#define mm4 mm4
//		movq	mm4, mm7; 					// mm4=mm7=a1 mm4					
//		paddsw	mm7, mm0; 					// mm7=b1
		 psubsw	mm4, mm0; 					// mm4=b2								mm0
		movq	[ebx+16*2], mm7;		//												mm7
		 movq	mm5, mm6;						// mm5=mm6=a0	mm5
//		movq	[ebx+32*2], mm4;		//												mm4
																// b(0,1,2,3) are stored back
								
//#define mm6 mm6
//		movq	mm6, [ebx+8*2]; 		// mm6=a0				mm6 			
//		psllw	mm6, idct_chen_mmx1_LS1;

//#define mm3 mm3
		movq	mm3, [ebx+56*2];		// mm3=a3				mm3
		 ;//slot
		movq	[ebx+32*2], mm4;		//												mm4
		 psllw	mm3, idct_chen_mmx1_LS1;

//#define mm2 mm2
//#define mm1 mm1
//#define mm5 mm5
//#define mm7 mm7
		movq	mm2, idct_chen_mmx1_c4c;		//				mm2
		 movq	mm7, mm3;						// mm7=mm3=a3	mm7
		movq	mm1, idct_chen_mmx1_c1c;		//				mm1

//		movq	mm5, mm6;						// mm5=mm6=a0	mm5

		 paddsw	mm6, mm2;						// a0+c4c
		movq	mm0, idct_chen_mmx1_c4; 		//				mm0
		 paddsw	mm3, mm2;						// a3+c4c								mm2
		pmulhw	mm3, mm0;						//
		 paddsw	mm5, mm1;						// a0+c1c
//		paddsw	mm7, mm1;						// a3+c1c								mm1

//#define mm0 mm0
//#define mm4 mm4
//		movq	mm0, idct_chen_mmx1_c4; 		//				mm0
		movq	mm4, idct_chen_mmx1_c1; 		//				mm4
		 pmulhw	mm5, mm4;						//
		paddsw	mm7, mm1;						// a3+c1c								mm1
		 pmulhw	mm6, mm0;						//												mm0

//		pmulhw	mm3, mm0;						//

		movq	mm1, [ebx+40*2];		// mm1=a2				mm1
		 pmulhw	mm7, mm4;						//												mm4

		movq	mm2, [ebx+24*2];		// mm2=a1				mm2 			
		 psllw	mm1, idct_chen_mmx1_LS1;

		paddsw	mm3,mm5;						// mm3=c3								mm5
//		psubsw	mm6,mm7;						// mm6=c0								mm7


//#define mm2 mm2
//#define mm1 mm1
//		movq	mm2, [ebx+24*2];		// mm2=a1				mm2 			
		 psllw	mm2, idct_chen_mmx1_LS1;
		movq	mm5, idct_chen_mmx1_c2c;		//				mm5
		 psubsw	mm6,mm7;						// mm6=c0								mm7

//		movq	mm1, [ebx+40*2];		// mm1=a2				mm1
//		psllw	mm1, idct_chen_mmx1_LS1;
//#define mm5 mm5
//#define mm0 mm0

//		movq	mm5, idct_chen_mmx1_c2c;		//				mm5
		movq	mm0, idct_chen_mmx1_c3c;		//				mm0

//#define mm4 mm4
//#define mm7 mm7
		 movq	mm4, mm2;						// mm4=mm2=a1	mm4
		movq	mm7, mm1;						// mm7=mm1=a2	mm7

		 paddsw	mm2, mm5;						// a1+c2c
		paddsw	mm1, mm5;						// a2+c2c								mm5
		 paddsw	mm4, mm0; 					// a1+c3c
		movq	mm5, idct_chen_mmx1_c2;		//				mm5
		 paddsw	mm7, mm0; 					// a2+c3c								mm0

//#define mm5 mm5
//#define mm0 mm0
//		movq	mm5, idct_chen_mmx1_c2;		//				mm5
		movq	mm0, idct_chen_mmx1_c3;		//				mm0
		 pmulhw	mm1, mm5;						//
		pmulhw	mm4, mm0; 					//
		 ;//slot
		pmulhw	mm7, mm0; 					//												mm0
		 ;//slot
		movq	mm0, idct_chen_mmx1_c0c;		//				mm0
		 pmulhw	mm2, mm5;						//												mm5
		movq	mm5, mm3;						// mm5=mm3=c3	mm5
		 psubsw	mm1, mm4;						// mm1=c1								mm4

//		paddsw	mm2, mm7;						// mm2=c2								mm7

																// c(0,1,2,3) in ra(0,9,8,1)


//#define mm4 mm4
//#define mm5 mm5
//#define mm0 mm0
		movq	mm4, mm6;						// mm4=mm6=c0	mm4
		 paddsw	mm6, mm1;						// mm6=a0
		paddsw	mm2, mm7;						// mm2=c2								mm7
		 psubsw	mm4, mm1;						// mm4=a1								mm1
//		movq	mm5, mm3;						// mm5=mm3=c3	mm5
		movq	mm7, idct_chen_mmx1_c0;		//				mm7
		 paddsw	mm3, mm2;						// mm3=a3 
		psubsw	mm5, mm2;						// mm5=a2								mm2
																// a(0,1,2,3) in ra(0,31,32,1)


//		movq	mm0, idct_chen_mmx1_c0c;		//				mm0
		 psllw	mm4, 1
		psllw	mm5, 1
		 paddsw	mm4, mm0; 					// a1+c0c
		paddsw	mm5, mm0; 					// a2+c0c								mm0
//#define mm7 mm7
//#define mm1 mm1
//#define mm2 mm2
//#define mm0 mm0
		
//		movq	mm7, idct_chen_mmx1_c0;		//				mm7
		 pmulhw	mm4, mm7; 					//												mm7
		movq	mm2, [ebx+0*2];		// mm2=b0				mm2
		 pmulhw	mm5, mm7;
		movq	mm0, mm2; 					//								mm0
		 paddsw	mm2, mm3;						// mm2=y0
		movq	mm7, [ebx+16*2];		// mm7=b1				mm7
		 psubsw	mm0, mm3;						// mm0=y7								mm3
		movq	mm1, mm5; 					//								mm1
		 psubsw	mm5, mm4; 					// mm5=c1
		paddsw	mm1, mm4; 					// mm1=c2								mm4
		 movq	mm4, mm7; 					//								mm4
		movq	[ebx+0*2], mm2;		//												mm2
		 paddsw	mm7, mm1; 					// mm7=y1																// c(0,1,2,3) in ra(0,32,35,1)
		movq	[ebx+56*2], mm0;		//												mm0


//#define mm7 mm7
//#define mm4 mm4
//#define mm2 mm2
//		movq	mm7, [ebx+16*2];		// mm7=b1				mm7
//		 movq	mm4, mm7; 					//								mm4
//		paddsw	mm7, mm1; 					// mm7=y1
		 psubsw	mm4, mm1; 					// mm4=y6								mm1
		movq	[ebx+8*2], mm7;		//												mm7
		 ;//slot
		movq	mm2, [ebx+48*2];		// mm2=b3				mm2
		movq	[ebx+48*2], mm4;		//												mm4

//#define mm3 mm3
//#define mm0 mm0
//#define mm7 mm7
		 movq	mm3, mm2; 					//								mm3
		movq	mm0, [ebx+32*2];		// mm0=b2				mm0
		 paddsw	mm2, mm6;						// mm2=y3
		psubsw	mm3, mm6;						// mm3=y4								mm6
		 movq	mm7, mm0; 					//								mm7
		movq	[ebx+24*2], mm2;		//												mm2
		 paddsw	mm0, mm5; 					// mm0=y2
		movq	[ebx+32*2], mm3;		//												mm3
		 psubsw	mm7, mm5; 					// mm7=y5								mm5
		movq	[ebx+16*2], mm0;		//												mm0
		;//slot
		movq	mm0, [ebx+16*2+8]; 	// mm0=b2				mm0
		;//slot
		movq	[ebx+40*2], mm7;		//												mm7


//#define mm0 mm0
//#define mm1 mm1
//#define mm2 mm2
//#define mm3 mm3
//#define mm4 mm4
//#define mm5 mm5
//#define mm6 mm6
//#define mm7 mm7
//		movq	mm0, [ebx+16*2+8]; 	// mm0=b2				mm0
		 psllw	mm0, idct_chen_mmx1_LS1;
		movq	mm1, [ebx+48*2+8]; 	// mm1=b3				mm1
		 movq	mm4, mm0; 					// mm4=mm0=b2 mm4
		movq	mm2, idct_chen_mmx1_c6c;		//				mm2
		 psllw	mm1, idct_chen_mmx1_LS1;
		movq	mm3, idct_chen_mmx1_c5c;		//				mm3


		 movq	mm5, mm1; 					// mm5=mm1=b3 mm5
		paddsw	mm0, mm2; 					// b2+c6c
		 paddsw	mm1, mm2; 					// b3+c6c								mm2

		movq	mm6, idct_chen_mmx1_c6;		//				mm6
		 paddsw	mm4, mm3; 					// b2+c5c
		movq	mm7, idct_chen_mmx1_c5;		//				mm7
		 pmulhw	mm1, mm6; 					//
		paddsw	mm5, mm3; 					// b3+c5c								mm3
		 pmulhw	mm4, mm7; 					//
		movq	mm2, [ebx+0*2+8];		// mm2=b0				mm2
		 pmulhw	mm0, mm6; 					//												mm6
		pmulhw	mm5, mm7; 					//												mm7
		 psllw	mm2, idct_chen_mmx1_LS1;
//		paddsw	mm1, mm4; 					// mm1=a3								mm4


//		 psubsw	mm0,mm5;						// mm0=a2								mm5
//#define mm2 mm2
//#define mm3 mm3
//#define mm4 mm4
//		movq	mm2, [ebx+0*2+8];		// mm2=b0				mm2
//		psllw	mm2, idct_chen_mmx1_LS1;

		movq	mm3, [ebx+32*2+8]; 	// mm3=b1				mm3
		 paddsw	mm1, mm4; 					// mm1=a3								mm4
		movq	mm4, idct_chen_mmx1_c0c;		//				mm4
		 psllw	mm3, idct_chen_mmx1_LS1;

		
		movq	mm6, idct_chen_mmx1_c0;		//				mm6
		 paddsw	mm2, mm4; 					// b0+c0c
//		psubsw	mm0,mm5;						// mm0=a2								mm5
//		paddsw	mm3, mm4; 					// b1+c0c								mm4					
//#define mm6 mm6
//		movq	mm6, idct_chen_mmx1_c0;		//				mm6
		pmulhw	mm2, mm6;
		 paddsw	mm3, mm4; 					// b1+c0c								mm4					
		pmulhw	mm3, mm6; 					//												mm6
		 psubsw	mm0,mm5;						// mm0=a2								mm5
//#define mm7 mm7
		movq	mm6, [ebx+8*2+8];				// mm6=a0				mm6 			
		 ;//stall
		movq	mm7, mm2; 					//								mm7
		 paddsw	mm2, mm3; 					// mm2=a0
		psubsw	mm7, mm3; 					// mm7=a1								mm3
																// a(0,1,2,3) in rb(24,28,16,17)
//#define mm5 mm5
		 movq	mm5, mm2; 					// mm5=mm2=a0 mm5					
		paddsw	mm2, mm1; 					// mm2=b0
		 psubsw	mm5, mm1; 					// mm5=b3								mm1
		movq	mm4, mm7; 					// mm4=mm7=a1 mm4					
		 paddsw	mm7, mm0; 					// mm7=b1
		movq	[ebx+0*2+8], mm2;		//												mm2
		 psllw	mm6, idct_chen_mmx1_LS1;
		movq	[ebx+48*2+8], mm5; 	//												mm5

//#define mm4 mm4
//		movq	mm4, mm7; 					// mm4=mm7=a1 mm4					
//		paddsw	mm7, mm0; 					// mm7=b1
		 psubsw	mm4, mm0; 					// mm4=b2								mm0
		movq	[ebx+16*2+8], mm7; 	//												mm7
		 ;//stall
//		movq	[ebx+32*2+8], mm4; 	//												mm4
																// b(0,1,2,3) are stored back
								
//#define mm6 mm6
//		movq	mm6, [ebx+8*2+8];				// mm6=a0				mm6 			
//		psllw	mm6, idct_chen_mmx1_LS1;

//#define mm3 mm3
		movq	mm3, [ebx+56*2+8];		// mm3=a3				mm3
		 ;//stall
		movq	[ebx+32*2+8], mm4; 	//												mm4
		 psllw	mm3, idct_chen_mmx1_LS1;

//#define mm2 mm2
//#define mm1 mm1
//#define mm5 mm5
//#define mm7 mm7
		movq	mm2, idct_chen_mmx1_c4c;		//				mm2
		 movq	mm5, mm6;						// mm5=mm6=a0	mm5
		movq	mm1, idct_chen_mmx1_c1c;		//				mm1

		 movq	mm7, mm3;						// mm7=mm3=a3	mm7

		movq	mm0, idct_chen_mmx1_c4; 		//				mm0
		 paddsw	mm3, mm2;						// a3+c4c								mm2
		movq	mm4, idct_chen_mmx1_c1; 		//				mm4
		 pmulhw	mm3, mm0;						//
		paddsw	mm6, mm2;						// a0+c4c
		 paddsw	mm5, mm1;						// a0+c1c
		paddsw	mm7, mm1;						// a3+c1c								mm1

//#define mm0 mm0
//#define mm4 mm4
//		movq	mm0, idct_chen_mmx1_c4; 		//				mm0
//		movq	mm4, idct_chen_mmx1_c1; 		//				mm4

//		pmulhw	mm3, mm0;						//
		 pmulhw	mm5, mm4;						//
		paddsw	mm3,mm5;						// mm3=c3								mm5
		 pmulhw	mm6, mm0;						//												mm0
		movq	mm2, [ebx+24*2+8];		// mm2=a1				mm2 			
		 pmulhw	mm7, mm4;						//												mm4

//		psubsw	mm6,mm7;						// mm6=c0								mm7


//#define mm2 mm2
//#define mm1 mm1
//		movq	mm2, [ebx+24*2+8];		// mm2=a1				mm2 			
		movq	mm1, [ebx+40*2+8];		// mm1=a2				mm1
		 psllw	mm2, idct_chen_mmx1_LS1;
		movq	mm5, idct_chen_mmx1_c2c;		//				mm5
		 psllw	mm1, idct_chen_mmx1_LS1;
//#define mm5 mm5
//#define mm0 mm0

//		movq	mm5, idct_chen_mmx1_c2c;		//				mm5
		movq	mm0, idct_chen_mmx1_c3c;		//				mm0
		 psubsw	mm6,mm7;						// mm6=c0								mm7

//#define mm4 mm4
//#define mm7 mm7
		movq	mm4, mm2;						// mm4=mm2=a1	mm4
		 movq	mm7, mm1;						// mm7=mm1=a2	mm7

		paddsw	mm2, mm5;						// a1+c2c
		 paddsw	mm1, mm5;						// a2+c2c								mm5
		movq	mm5, idct_chen_mmx1_c2;		//				mm5
		 paddsw	mm4, mm0; 					// a1+c3c
		paddsw	mm7, mm0; 					// a2+c3c								mm0

//#define mm5 mm5
//#define mm0 mm0
//		movq	mm5, idct_chen_mmx1_c2;		//				mm5
		 pmulhw	mm1, mm5;						//
		movq	mm0, idct_chen_mmx1_c3;		//				mm0
		 pmulhw	mm2, mm5;						//												mm5
		pmulhw	mm4, mm0; 					//

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -